#!pip install gensim
This notebook shows how to load the gensim Doc2Vec model from the data store and run it against some of our test cases.
We also include the steps needed to regenerate the model if desired.
Various tests are included to gauge the accuracy of the model.
Doc2Vec is an NLP tool for representing documents as vectors and is a generalization of the Word2Vec method. This tutorial will serve as an introduction to Doc2Vec and present ways to train and assess a Doc2Vec model.
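Before diving into the arXiv data, here is a minimal, self-contained sketch of the Doc2Vec workflow (toy documents and made-up parameter values; the parameter names follow the pre-4.0 gensim API used throughout this notebook, where newer gensim renames size/iter to vector_size/epochs):
# toy illustration of the Doc2Vec workflow: build a tagged corpus, train, then infer vectors
import gensim
toy_docs = ["neural networks learn vector representations of text",
            "stochastic gradient descent minimizes a loss function",
            "black holes emit thermal radiation"]
toy_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(d), [i])
              for i, d in enumerate(toy_docs)]
toy_model = gensim.models.doc2vec.Doc2Vec(size=10, min_count=1, iter=20)
toy_model.build_vocab(toy_corpus)
toy_model.train(toy_corpus, total_examples=toy_model.corpus_count, epochs=toy_model.iter)
# every training document now has a 10-dimensional vector, and new text can be embedded as well
print(toy_model.docvecs[0])
print(toy_model.infer_vector(gensim.utils.simple_preprocess("gradient descent for neural networks")))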
import gensim
import os
import json
import numpy as np
from gensim.models.doc2vec import Doc2Vec
# The data is stored on Azure. There are two ways to access it:
# one is through the Azure blob storage API;
# the other is to pull the data directly from the URLs below.
# You can use wget if the host running your Jupyter instance has it installed;
# otherwise just use your browser to download the files.
#!wget https://dbgannonstorage.blob.core.windows.net/algorithmiagensim/gensim_model
#!wget https://dbgannonstorage.blob.core.windows.net/algorithmiagensim/data_collection.p
#!wget https://dbgannonstorage.blob.core.windows.net/algorithmiagensim/topicdict
import azure
from azure.storage.blob import ContentSettings
from azure.storage.blob import BlockBlobService
block_blob_service = BlockBlobService(account_name='dbgannonstorage')
block_blob_service.get_blob_to_path('algorithmiagensim', 'gensim_model', 'gensim_model')
The data comes from Cornell's arXiv. It consists of collections of paper abstracts, titles, and arXiv subtopic labels, called here "sites".
The data that was used to train the model is labeled scimltrain*.
To use the model we need only sciml_train_sites, and this list is ordered the same way as the rows of the gensim model: the embedding for train abstract x is in model row x, and its classification is item x in the train_sites list.
The classifications are based on the author-assigned subtopic classification. We have a dictionary, topicdict, that maps subtopics into our main topics: "Physics", "compsci", "math", "bio", and "finance".
The test data comes in two pieces: one collected at the same time as the training data and a second collection gathered more recently. For testing we concatenate them together.
import pickle
block_blob_service.get_blob_to_path('algorithmiagensim', 'data_collection.p', 'data_collection.p')
data_collection = pickle.load( open( "data_collection.p", "rb" ) )
test_sites = data_collection['test_sites']
test_titles = data_collection['test_titles']
test_abstracts = data_collection['test_abstracts']
train_sites = data_collection['train_sites']
train_titles = data_collection['train_titles']
train_abstracts = data_collection['train_abstracts']
block_blob_service.get_blob_to_path('algorithmiagensim', 'topicdict', 'topicdict')
with open('topicdict', 'r') as f:
    topst = f.read()
topicdict = json.loads(topst)
# one small patch to the dictionary.
topicdict['q-bio.QM'] = 'bio'
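As a quick sanity check of the row ordering described above, we can look up the main topic for any training document (a minimal sketch using the variables just loaded; the index 0 is arbitrary):
# row x of the model corresponds to train_abstracts[x] / train_sites[x];
# topicdict maps the arXiv subtopic label to one of the five main topics
x = 0
print(train_titles[x])
print(train_sites[x], '->', topicdict.get(train_sites[x], 'unmapped'))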
# The following code allows you to retrain the model. Setting build_model = False
# will use the pretrained model loaded from the Azure data store.
build_model = False
if build_model:
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)
    train_corpus = []
    for i in range(len(train_abstracts)):
        line = train_abstracts[i]
        train_corpus.append(gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i]))
    print("building vocabulary")
    model.build_vocab(train_corpus)
    print("doing the training")
    %time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
    print('model trained')
    model.save("gensim_model")
else:
    model = Doc2Vec.load("gensim_model")
test_corpus = []
for i in range(len(test_abstracts)):
    line = test_abstracts[i]
    test_corpus.append(gensim.utils.simple_preprocess(line))
m = model.docvecs
mar = np.zeros((m.count, 50))
# normalize each training document vector so dot products become cosine similarities
for i in range(m.count):
    x = np.linalg.norm(m[i])
    mar[i] = m[i]/x
We compute an embedding of the candidate abstract into the model space. This is a somewhat stochastic process, so we get a different vector each time. (To mitigate this, find_best_topic below repeats the test 10 times.) We take the vector and compute its dot product with the rows of the model, which is equivalent to a cosine metric. Next we take the top five results and reject any with a score less than 0.5. The final results are averaged and returned in the form
{'compsci': 0.0, 'Physics': 0.0, 'math': 0.0, 'bio': 0.0, 'finance': 0.0}
where the values add up to 100.
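Since both the inferred vector and the model rows are normalized, the dot product is exactly the cosine similarity; here is a tiny numpy sketch with toy vectors (not taken from the model):
# toy vectors: after normalization, the dot product equals cosine similarity
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 0.5, 1.0])
cos_sim = np.dot(a/np.linalg.norm(a), b/np.linalg.norm(b))
# same value as the cosine similarity computed directly from its definition
assert np.isclose(cos_sim, np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b)))
print(cos_sim)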
def find_best_threed(abstract):
    topics = ["compsci", "Physics", "math", "bio", "finance"]
    score = {"compsci": 0.0, "Physics": 0.0, "math": 0.0, "bio": 0.0, "finance": 0.0}
    # embed the candidate abstract into the model space and normalize it
    new = model.infer_vector(gensim.utils.simple_preprocess(abstract))
    x = np.linalg.norm(new)
    v0 = new/x
    # cosine similarity (dot product of unit vectors) against every training document
    norms = []
    for i in range(5000):
        q = np.dot(v0, mar[i])
        norms.append([q, i])
    norms.sort(reverse=True)
    # keep the top five matches, dropping any with similarity below 0.5
    for i in range(5):
        tv = norms[i][0]
        if norms[i][0] < 0.5:
            tv = 0
        ts = train_sites[norms[i][1]]
        if ts != 'nlin.CG':
            score[topicdict[ts]] += tv
    total = 0.0
    for top in topics:
        total += score[top]
    if total == 0:
        return {"compsci": 0.0, "Physics": 0.0, "math": 0.0, "bio": 0.0, "finance": 0.0}
    for top in topics:
        score[top] = np.round(1000*score[top]/total)
    return score
def find_best_topic(abstract):
    qtot = {'compsci': 0.0, 'Physics': 0.0, 'math': 0.0, 'bio': 0.0, 'finance': 0.0}
    topics = ["compsci", "Physics", "math", "bio", "finance"]
    # run the inference 10 times and average to damp the stochastic embedding
    for i in range(10):
        q = find_best_threed(abstract)
        for top in topics:
            qtot[top] += np.round(np.round(q[top]/10)/10.0)
    return qtot
Notice how unbalanced the test suite is: there are too many physics papers and too few bio and finance papers.
fin_examples = []
phy_examples = []
mat_examples = []
csi_examples = []
bio_examples = []
topicdict['none'] = 'none'
for i in range(len(test_sites)):
    if topicdict[test_sites[i]] == 'finance':
        fin_examples.append(i)
    elif topicdict[test_sites[i]] == 'Physics':
        phy_examples.append(i)
    elif topicdict[test_sites[i]] == 'math':
        mat_examples.append(i)
    elif topicdict[test_sites[i]] == 'compsci':
        csi_examples.append(i)
    elif topicdict[test_sites[i]] == 'bio':
        bio_examples.append(i)
print('finance = ', len(fin_examples))
print('physics = ', len(phy_examples))
print('compsci = ', len(csi_examples))
print('math = ', len(mat_examples))
print('bio = ', len(bio_examples))
s = fin_examples[3]
print(test_titles[s])
print(test_abstracts[s])
print("---------")
q = find_best_topic(test_abstracts[s])
print(q)
In the following cells we look at the accuracy based on standard metrics, including a confusion matrix.
We first consider how often the predictor correctly selects the right label as one of the top two predictions;
we then run it looking at the top prediction only.
True Positive (TP): Correctly classified as the class of interest
True Negative (TN): Correctly classified as not the class of interest
False Positive (FP): Incorrectly classified as the class of interest
False Negative (FN): Incorrectly classified as not the class of interest
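For reference, here is a minimal sketch of how these counts turn into the per-topic rates printed by score_area below (the counts tp, fp, tn, fn are made up for illustration):
# hypothetical counts for one topic
tp, fn = 80.0, 20.0          # papers that truly belong to the topic
fp, tn = 10.0, 300.0         # papers that truly belong to other topics
true_pos_rate  = tp/(tp + fn)    # fraction of the topic's papers that were recognized
false_neg_rate = fn/(tp + fn)    # fraction of the topic's papers that were missed
false_pos_rate = fp/(fp + tn)    # fraction of other papers wrongly assigned to the topic
true_neg_rate  = tn/(fp + tn)    # fraction of other papers correctly rejected
print(true_pos_rate, false_neg_rate, false_pos_rate, true_neg_rate)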
def score_area(topic):
    score = 0.0
    falsepos = 0.0
    falseneg = 0.0
    trueneg = 0.0
    count = len(test_titles)
    topcount = 0
    topics = ["compsci", "Physics", "math", "bio", "finance"]
    for i in range(count):
        if topicdict[test_sites[i]] == topic:
            topcount += 1
        q = find_best_topic(test_abstracts[i])
        # find the top two predicted topics
        besttop = ''
        second = ''
        bestval = 0.0
        secbval = 0.0
        for top in topics:
            if q[top] > bestval:
                secbval = bestval
                second = besttop
                besttop = top
                bestval = q[top]
            elif q[top] > secbval:
                secbval = q[top]
                second = top
        # a true positive counts if either of the top two predictions is correct
        if topicdict[test_sites[i]] == topic and ((besttop == topicdict[test_sites[i]]) or second == topicdict[test_sites[i]]):
            score = score + 1
        if topicdict[test_sites[i]] != topic and ((besttop == topic) or second == topic):
            falsepos += 1
        if topicdict[test_sites[i]] != topic and ((besttop != topic) and second != topic):
            trueneg += 1
        if topicdict[test_sites[i]] == topic and ((besttop != topic) and second != topic):
            falseneg += 1
    score = score/topcount
    falsepos = falsepos/(count-topcount)
    trueneg = trueneg/(count-topcount)
    falseneg = falseneg/topcount
    print(topic, "true pos =", score)
    #print(topic, "false pos =", falsepos)
    #print(topic, 'true neg =', trueneg)
    print(topic, 'false neg =', falseneg)
Here we look at the true positives and false negatives based on the top two predictions. In this case a true positive means that either the first or second choice was correct, and a false negative means that both the first and second choices said it was not the correct topic when it should have been classified as correct.
score_area("Physics")
score_area("bio")
score_area("math")
score_area('compsci')
score_area('finance')
These are the results when we look only at the top prediction. Notice the effect of having a poorly balanced data set.
topic      true pos              false pos              true neg              false neg
math       0.8269230769230769    0.18364312267657992    0.8163568773234201    0.17307692307692307
compsci    0.603125              0.05135322692574601    0.9486467730742539    0.396875
bio        0.4935064935064935    0.017423771001866834   0.9825762289981331    0.5064935064935064
Physics    0.8618925831202046    0.05822267620020429    0.9417773237997957    0.13810741687979539
finance    0.5795454545454546    0.010161386730424387   0.9898386132695756    0.42045454545454547
st = "Reference class forecasting is a method to remove optimism bias and strategic misrepresentation in infrastructure projects and programmes. In 2012 the Hong Kong government's Development Bureau commissioned a feasibility study on reference class forecasting in Hong Kong - a first for the Asia-Pacific region. This study involved 25 roadwork projects, for which forecast costs and durations were compared with actual outcomes. The analysis established and verified the statistical distribution of the forecast accuracy at various stages of project development, and benchmarked the projects against a sample of 863 similar projects. The study contributed to the understanding of how to improve forecasts by de-biasing early estimates, explicitly considering the risk appetite of decision makers, and safeguarding public funding allocation by balancing exceedance and under-use of project budgets. "
q = find_best_topic(st)
print(q)
st = "We consider matrix completion for recommender systems from the point of view of link prediction on graphs. Interaction data such as movie ratings can be represented by a bipartite user-item graph with labeled edges denoting observed ratings. Building on recent progress in deep learning on graph-structured data, we propose a graph auto-encoder framework based on differentiable message passing on the bipartite interaction graph. Our model shows competitive performance on standard collaborative filtering benchmarks. In settings where complimentary feature information or structured data such as a social network is available, our framework outperforms recent state-of-the-art methods."
q = find_best_topic(st)
print(q)
#compute confusion matrix
import pandas as pd
data = np.array([['', 'Physics', 'math', 'compsci', 'bio', 'finance'],
                 ['Physics', 0., 0., 0., 0., 0.],
                 ['math', 0., 0., 0., 0., 0.],
                 ['compsci', 0., 0., 0., 0., 0.],
                 ['bio', 0., 0., 0., 0., 0.],
                 ['finance', 0., 0., 0., 0., 0.]])
data2 = np.zeros((5, 5))
df = pd.DataFrame(data=data2, index=data[1:, 0], columns=data[0, 1:])
count = len(test_titles)
topics = ["compsci", "Physics", "math", "bio", "finance"]
countdic = {"compsci": 0., "Physics":0., "math":0., "bio":0., "finance": 0.}
for i in range(count):
    if topicdict[test_sites[i]] != 'none':
        countdic[topicdict[test_sites[i]]] += 1.0
    q = find_best_topic(test_abstracts[i])
    # find the top two predicted topics
    besttop = ''
    second = ''
    bestval = 0.0
    secbval = 0.0
    for top in topics:
        if q[top] > bestval:
            secbval = bestval
            second = besttop
            besttop = top
            bestval = q[top]
        elif q[top] > secbval:
            secbval = q[top]
            second = top
    if topicdict[test_sites[i]] != 'none':
        # increment the cell for (true topic, predicted topic)
        df[topicdict[test_sites[i]]][besttop] += 1.0
df2 = df.copy()
# normalize each column by the number of test papers in that true topic
for top in topics:
    df[top] = np.round(100*df[top]/countdic[top])
df.transpose()
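As an aside, scikit-learn provides a confusion_matrix helper that builds the same kind of table from lists of true and predicted labels; here is a minimal sketch with toy label lists (stand-ins, not the actual per-abstract results gathered above):
from sklearn.metrics import confusion_matrix
labels = ["compsci", "Physics", "math", "bio", "finance"]
# toy stand-ins for the per-abstract true and predicted main topics
y_true = ["math", "math", "Physics", "bio", "compsci", "finance"]
y_pred = ["math", "Physics", "Physics", "bio", "compsci", "math"]
# cm[i, j] counts abstracts whose true topic is labels[i] and predicted topic is labels[j]
cm = confusion_matrix(y_true, y_pred, labels=labels)
print(cm)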
#!pip install Algorithmia
Next we call the version of this classifier hosted on Algorithmia and compare it with the local version above. It is exactly the same model; the differences are due to the stochastic nature of the function used to map a new abstract into the model space.
import Algorithmia
client = Algorithmia.client('simboDpCMa4mA1GrC3MW95c/SKL1')
algo = client.algo('dbgannon/SciDocClassifier')
for s in range(200, 210):
    print(train_titles[s])
    #print(train_abstracts[s])
    r = algo.pipe(train_abstracts[s])
    print(r.result)
    q = find_best_topic(train_abstracts[s])
    print(q)
    print("---------")
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
# project the normalized 50-dimensional document vectors into 2-D with t-SNE
RS = 20150101
w_embedded = TSNE(random_state=RS).fit_transform(mar)
w_embedded.shape
# color each training document by its main topic
N = 5000
c = np.array([0]*N)
for i in range(N):
    if (train_sites[i] == 'nlin.CG') or (train_sites[i] == "none"):
        z = 'math'
    else:
        z = topicdict[train_sites[i]]
    if z == 'math':
        c[i] = 1
    elif z == 'Physics':
        c[i] = 2
    elif z == 'bio':
        c[i] = 3
    elif z == 'finance':
        c[i] = 4
    elif z == 'compsci':
        c[i] = 5
plt.figure(figsize=(8, 6))
plt.scatter(w_embedded[:, 0], w_embedded[:, 1], c=c)
plt.colorbar()
plt.grid()