forked from echen/link-prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict_links.py
79 lines (65 loc) · 2.74 KB
/
predict_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
"""
Given a training set of examples and associated features...
- Each example is of the form (src, dest, 1 if src follows dest else 0)
- Features are things like # of followers of src, Jaccard similarity
between src and dest nodes, etc.
...train a machine learning classifier on this set.
Then apply this same classifier on a set of test src nodes, to form
a ranked prediction of which dest nodes each src is likely to follow.
"""
# Use this file to train the classifier.
#
# The first column in this file is the truth of a (src, dest) edge
# (i.e., 1 if the edge is known to exist, 0 otherwise).
# The rest of the columns are features on that edge.
TRAINING_SET_WITH_FEATURES_FILENAME = "my_data/my_ml_training_set_with_features.csv"
# This file contains candidate edge pairs to score, along with
# features on these candidate edges.
#
# The first column is the src node, the second is the dest node,
# the rest of the columns are features.
CANDIDATES_TO_SCORE_FILENAME = "my_data/my_candidates_with_features.csv"
########################################
# STEP 1: Read in the training examples.
########################################
# Parallel lists: truths[i] is the label belonging to training_examples[i].
truths = []  # A truth is 1 (for a known true edge) or 0 (for a false edge).
training_examples = []  # Each training example is an array of features.
# Open via a context manager so the file handle is closed as soon as the
# training set has been read (the bare open() in a for-loop never closed it).
with open(TRAINING_SET_WITH_FEATURES_FILENAME) as training_file:
    for line in training_file:
        if not line.strip():
            # Skip blank lines (e.g. a stray empty line at the end of the
            # file) rather than failing on float('').
            continue
        fields = [float(x) for x in line.split(",")]
        # Column 0 is the truth label; all remaining columns are features.
        truths.append(fields[0])
        training_examples.append(fields[1:])
#############################
# STEP 2: Train a classifier.
#############################
# Fit a 500-tree random forest on the training examples read above.
# oob_score=True asks scikit-learn to also compute the out-of-bag score
# while fitting.
#
# The compute_importances=True argument has been dropped: later
# scikit-learn releases no longer accept it (the constructor raises
# TypeError), and this script never reads the feature importances, so
# omitting it does not change the predictions on any version.
clf = RandomForestClassifier(n_estimators=500, oob_score=True)
clf = clf.fit(training_examples, truths)
###############################
# STEP 3: Score the candidates.
###############################
# Candidates are scored in batches so that at most BATCH_SIZE feature rows
# are held in memory at any one time.
BATCH_SIZE = 10000


def _print_scores(classifier, node_pairs, feature_rows):
    """Score one batch of candidate edges and print a CSV line for each.

    Each output line is "src,dest,score", where score is column 1 of the
    row returned by classifier.predict_proba (the probability of the
    second class, i.e. label 1 when the training labels are 0 and 1).

    node_pairs and feature_rows are parallel lists: node_pairs[i] is the
    (src, dest) tuple whose features are feature_rows[i].

    Does nothing for an empty batch, because predict_proba is not called
    with zero samples (scikit-learn rejects empty input).
    """
    if not feature_rows:
        return
    probabilities = classifier.predict_proba(feature_rows)
    for (src, dest), class_probabilities in zip(node_pairs, probabilities):
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print(",".join([str(x) for x in [src, dest, class_probabilities[1]]]))


src_dest_nodes = []
examples = []
# Context manager closes the candidates file once every line is consumed.
with open(CANDIDATES_TO_SCORE_FILENAME) as candidates_file:
    for line in candidates_file:
        if not line.strip():
            # Skip blank lines rather than failing on float('').
            continue
        fields = [float(feature) for feature in line.split(",")]
        # Columns 0 and 1 are the src and dest nodes; the rest are features.
        src_dest_nodes.append((fields[0], fields[1]))
        examples.append(fields[2:])
        if len(examples) == BATCH_SIZE:
            # A full batch is ready: score it, then start a fresh batch.
            # (The original looped over the undefined name `batch_size`
            # here, which raised NameError on the first full batch.)
            _print_scores(clf, src_dest_nodes, examples)
            src_dest_nodes = []
            examples = []
# Score whatever is left over after the last full batch (possibly nothing).
_print_scores(clf, src_dest_nodes, examples)