This notebook illustrates invoking the web service for the three-way classifier with the arXiv data. The three-way example uses three classifiers: a multiclass neural net, a multiclass logistic regression, and a random forest classifier. The three-way classifier uses a majority-vote system: if any two of the classifiers agree, that label is the main reply of the three-way classifier, and the remaining classifier's label is always put forward as a second choice.
The Python code for the three-way module is shown here.
import pandas as pd
# Maps the numeric class codes produced by the scoring modules to the
# quoted label strings used throughout this notebook.
labels = {1: "'bio'", 2: "'compsci'", 3: "'finance'", 4: "'math'"}


# The entry point function can contain up to two input arguments:
#   dataframe1: pandas.DataFrame with columns "Col1" (true class) and
#               "Scored Labels" (neural net prediction, coded 1-4)
#   dataframe2: pandas.DataFrame with columns "Scored Labels" (logistic
#               regression) and "Scored Labels (2)" (random forest)
def azureml_main(dataframe1=None, dataframe2=None):
    """Combine three classifiers' predictions by majority vote.

    For each row, if any two classifiers agree, their label is the
    first choice and the dissenter's label is the second choice.
    If all three disagree, the neural net's label is the first choice
    and the random forest's label is the second (same tie-break as
    the original branch order).

    Returns a DataFrame with columns 'Col1' (true class),
    'Scored Labels' (first choice), and 'second' (second choice).
    """
    tclass = dataframe1["Col1"]
    scored1 = dataframe1["Scored Labels"]
    scored2 = dataframe2["Scored Labels"]
    scored3 = dataframe2["Scored Labels (2)"]
    scored = []
    second = []
    lclass = []
    for i in range(len(tclass)):
        lclass.append(tclass[i])
        # Pick (winner, runner_up) codes by majority vote; branch order
        # matters only when several pairs agree simultaneously and
        # reproduces the original precedence exactly.
        if scored2[i] == scored3[i]:
            winner, runner_up = scored2[i], scored1[i]
        elif scored2[i] == scored1[i]:
            winner, runner_up = scored1[i], scored3[i]
        elif scored1[i] == scored3[i]:
            winner, runner_up = scored1[i], scored2[i]
        else:
            # Three-way disagreement: fall back to the neural net.
            winner, runner_up = scored1[i], scored3[i]
        scored.append(labels[winner])
        second.append(labels[runner_up])
    data = {'Col1': lclass, 'Scored Labels': scored, 'second': second}
    df = pd.DataFrame(data, columns=['Col1', 'Scored Labels', 'second'])
    return df
The code below assumes that the test file is stored in Azure blob storage as a CSV file. It uses the three-way classifier, the neural net classifier, and the logistic regression classifier so their confusion matrices can be compared.
import sys
import azure
import socket
import csv
import urllib2
import json
import pandas as pd
import numpy as np
import urllib
import json
import pickle
import unicodedata
The following establishes the URLs and API keys for each of the three classifiers.
# Endpoint URLs and API keys for the three deployed web services:
#   index 0 = the neural net version
#   index 1 = the logistic regression version
#   index 2 = the three-way (majority vote) version
url = [
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/281bbba94a9b47608351313de7c34fac/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/08e77a94e7e0444687286a98f889be3d/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/462fccc1e35e451e96d2678e66930654/execute?api-version=2.0&details=true ',
]
api_key = [
    '5dtOV9rJgoI8LtnX4N7vNMfhW4vdSPHcpCq1IUNQyTPTSwzIHQ4NADF1KkKvI/Q4mJSVFQh7JQcVyVxk7yuG/Q==',
    'RkHxsJXpnBFv3CVeL5B927VlKgUbQZBaHp8BbQ2QeMgPkCqUG+W9QY9RanTO3zMXnTywpQBX+db5+n6xxWkugA==',
    'e7iaNA+Pi9KdKZ70+Vv7JvZ81E3vKfqQShFuMf+a89GfGrixAHNq4HaHJZhpCY3TBujJmJh3hfKNtoYY3eKvnw==',
]
def sendrequest(datalist, url, api_key):
    """Score a single document against one AzureML web service.

    datalist: a list ["class", "document", "title"] forming one input row.
    url, api_key: endpoint and bearer token of the service to call.

    Returns the service's output rows on success.  On an HTTP error,
    prints diagnostics and returns the parsed error body — previously
    this path implicitly returned None, inconsistent with
    sendbulkrequest; callers treating the old None as falsy still work.
    """
    data = {
        "Inputs": {
            "input1":
            {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": [datalist]
            }, },
        "GlobalParameters": {}
    }
    body = str.encode(json.dumps(data))
    headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}
    req = urllib2.Request(url, body, headers)
    try:
        response = urllib2.urlopen(req)
        # If you are using Python 3+, replace urllib2 with urllib.request:
        #   req = urllib.request.Request(url, body, headers)
        #   response = urllib.request.urlopen(req)
        result = response.read()
        y = json.loads(result)
        return(y["Results"]["output1"]["value"]["Values"])
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())
        # Parse, show, and return the error body (matches sendbulkrequest).
        parsed = json.loads(error.read())
        print(parsed)
        return parsed
# This function is used to send multiple requests to the web service
# with a single invocation.
def sendbulkrequest(datalist, url, api_key):
    """Score a whole table of documents in one web-service call.

    datalist: a list of rows, each ["class", "document", "title"].
    url, api_key: endpoint and bearer token of the service to call.

    Returns the service's output rows on success, or the parsed error
    body on HTTP failure (after printing diagnostics).
    """
    payload = {
        "Inputs": {
            "input1":
            {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": datalist
            }, },
        "GlobalParameters": {}
    }
    encoded = str.encode(json.dumps(payload))
    auth_headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}
    request = urllib2.Request(url, encoded, auth_headers)
    try:
        # Python 3+ equivalent: urllib.request.Request / urlopen.
        reply = urllib2.urlopen(request)
        parsed = json.loads(reply.read())
        return(parsed["Results"]["output1"]["value"]["Values"])
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())
        return json.loads(error.read())
def read_azure_blob(subj, base):
    """Download and parse <base>/<subj>.csv from Azure blob storage.

    subj: blob name without the .csv extension
    base: base URL for the Azure blob store container
    Returns a list of CSV rows (each row a list of strings).
    """
    docpath = base + "/" + subj + ".csv"
    response = urllib2.urlopen(docpath)
    try:
        return list(csv.reader(response))
    finally:
        # The original leaked the HTTP connection; close it explicitly.
        response.close()
#first read the test data file arxivnophy
# Base URL of the Azure blob container that holds the arXiv CSV files.
base = "https://scimldata.blob.core.windows.net/arxiv"
# Load the test table; presumably each row is [class, document, title]
# (matching the datalist shape sendrequest expects) — TODO confirm.
nophytable = read_azure_blob('arxivnophy', base)
Next, test it with the first entry in the table using the three-way classifier.
# Score the first test row with the three-way classifier (index 2).
x = sendrequest(nophytable[0], url[2], api_key[2])
print x
len(x)
# Score the entire test table in one bulk call to the three-way service.
multx = sendbulkrequest(nophytable, url[2], api_key[2])
Let's see how well the three-way classifier does if we ask whether either the first or the second choice is correct.
# Per-class "top-2" accuracy: count how often the three-way classifier's
# first OR second choice matches the true label.
correct = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
num = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
for i in range(len(multx)):
    x = multx[i]
    # Each result row appears to be [true class, first choice, second
    # choice, ...] — matches azureml_main's output columns.
    key = x[0]
    num[key] = num[key]+1.0
    if (key==x[1]) or (key==x[2]):
        correct[key] = correct[key] +1.0
# Convert raw counts to a percentage per class.
for key in correct:
    correct[key] = 100.0*correct[key]/num[key]
print num
print correct
def make_confusion_table(multx):
    """Build a percentage confusion matrix from scored result rows.

    multx: list of rows where row[0] is the true class label and
           row[1] is the predicted label (quoted strings such as "'bio'").
    Returns a pandas DataFrame whose columns are predicted labels and
    whose rows are true labels; each entry is the percentage of
    documents of that true class that received that prediction.
    Prints the per-class document totals as a side effect.
    """
    names = ["'bio'", "'compsci'", "'finance'", "'math'"]
    # tble[predicted][true] = raw document count.
    tble = {p: {t: 0 for t in names} for p in names}
    for row in multx:
        true_label = row[0]
        predicted = row[1]
        if true_label == 'none':
            # Rows without a usable class label are skipped.
            print("found none")
        else:
            tble[predicted][true_label] = tble[predicted][true_label] + 1
    # Normalize each true-class column to percentages of its total.
    for x in names:
        total = sum(tble[y][x] for y in names)
        print("total for " + x + "= " + str(total))
        # Guard against a class absent from the data — the original
        # raised ZeroDivisionError here.
        if total > 0:
            for y in names:
                tble[y][x] = 100.0 * tble[y][x] / total
    # Explicit index fixes the row order (the original left it to
    # arbitrary dict ordering).
    return pd.DataFrame(tble, columns=names, index=names)
# Confusion matrix for the three-way (majority vote) classifier.
df = make_confusion_table(multx)
df
# next do the confusion matrix for the neural net classifier
multx2 = sendbulkrequest(nophytable, url[0], api_key[0])
df = make_confusion_table(multx2)
df
# now for the logistic regression service
multx3 = sendbulkrequest(nophytable, url[1], api_key[1])
df = make_confusion_table(multx3)
df
As we can see, all three are rather poor; the three-way classifier is only slightly better.