Azure ML service sample.

This notebook illustrates invoking the webservice for the three-way classifier with the arXiv data. The three-way example uses three classifiers: a multiclass neural net, a multiclass logistic regression and a random forest classifier. The three-way classifier uses a majority-vote system: if any two of the classifiers agree, that label is the main reply of the three-way. The remaining label is always put forward as a second choice.
The Python code for the three-way Python module is shown here.

import pandas as pd

# Map the numeric class codes emitted by the scorers back to label strings.
labels = { 1: "'bio'", 2: "'compsci'", 3: "'finance'", 4: "'math'"}

def azureml_main(dataframe1 = None, dataframe2 = None):
    """Combine three classifier outputs by majority vote.

    dataframe1 carries the true class ("Col1") and the first model's
    "Scored Labels"; dataframe2 carries the other two models' scores in
    "Scored Labels" and "Scored Labels (2)".

    Returns a DataFrame with the true class, the winning label and a
    runner-up label.
    """
    true_class = dataframe1["Col1"]
    vote1 = dataframe1["Scored Labels"]
    vote2 = dataframe2["Scored Labels"]
    vote3 = dataframe2["Scored Labels (2)"]
    winners, runners_up, classes = [], [], []
    for i in range(len(true_class)):
        classes.append(true_class[i])
        if vote2[i] == vote3[i]:
            first, runner = vote2[i], vote1[i]
        elif vote2[i] == vote1[i]:
            first, runner = vote1[i], vote3[i]
        elif vote1[i] == vote3[i]:
            first, runner = vote1[i], vote2[i]
        else:
            # No two models agree: fall back to model 1, offer model 3.
            first, runner = vote1[i], vote3[i]
        winners.append(labels[first])
        runners_up.append(labels[runner])

    data = {'Col1': classes, 'Scored Labels': winners, 'second': runners_up}
    return pd.DataFrame(data, columns=['Col1', 'Scored Labels', 'second'])

The code below assumes that the test file is stored in Azure blob storage as a csv file. It uses the three-way classifier, the neural net classifier and the logistic regression classifier, and compares the confusion matrix of each.

In [34]:
import sys
import azure
import socket
import csv
import urllib2
import json
import pandas as pd
import numpy as np
import urllib
import json
import pickle
import unicodedata
In [ ]:
The following establishes the URLs and API keys for each of the three classifiers.
In [76]:
# Service endpoints and matching API keys, indexed in lockstep:
#   0 = neural net, 1 = logistic regression, 2 = three-way combiner.
url = [
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/281bbba94a9b47608351313de7c34fac/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/08e77a94e7e0444687286a98f889be3d/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/462fccc1e35e451e96d2678e66930654/execute?api-version=2.0&details=true ',
]
api_key = [
    '5dtOV9rJgoI8LtnX4N7vNMfhW4vdSPHcpCq1IUNQyTPTSwzIHQ4NADF1KkKvI/Q4mJSVFQh7JQcVyVxk7yuG/Q==',
    'RkHxsJXpnBFv3CVeL5B927VlKgUbQZBaHp8BbQ2QeMgPkCqUG+W9QY9RanTO3zMXnTywpQBX+db5+n6xxWkugA==',
    'e7iaNA+Pi9KdKZ70+Vv7JvZ81E3vKfqQShFuMf+a89GfGrixAHNq4HaHJZhpCY3TBujJmJh3hfKNtoYY3eKvnw==',
]
In [3]:
def sendrequest(datalist, url, api_key):
    """POST one input row to an Azure ML web service and return the scores.

    datalist: a single row ["class", "document", "title"].
    url / api_key: endpoint and bearer key of the service to invoke.

    Returns the "Values" list from the service output on success, or the
    parsed error body on an HTTP error (previously the error body was only
    printed and None was returned implicitly; returning it makes this
    consistent with sendbulkrequest).
    """
    data = {
        "Inputs": {
            "input1": {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": [datalist]
            },
        },
        "GlobalParameters": {}
    }

    body = str.encode(json.dumps(data))
    headers = {'Content-Type': 'application/json',
               'Authorization': ('Bearer ' + api_key)}

    req = urllib2.Request(url, body, headers)

    try:
        response = urllib2.urlopen(req)

        # If you are using Python 3+, replace urllib2 with urllib.request:
        # req = urllib.request.Request(url, body, headers)
        # response = urllib.request.urlopen(req)

        result = response.read()
        y = json.loads(result)
        return y["Results"]["output1"]["value"]["Values"]

    # 'as' form is valid on Python 2.6+ and required on Python 3.
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))

        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())

        return json.loads(error.read())
    
In [4]:
#this function is used to send multiple requests to the web service with one invocation.
def sendbulkrequest(datalist, url, api_key):
    """Score many rows in a single web-service call.

    datalist: a list of rows, each ["class", "document", "title"].
    Returns the scored "Values" on success; on an HTTP error, prints the
    status code and headers and returns the parsed error body instead.
    """
    payload = {
        "Inputs": {
            "input1": {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": datalist
            },
        },
        "GlobalParameters": {}
    }

    request = urllib2.Request(
        url,
        str.encode(json.dumps(payload)),
        {'Content-Type': 'application/json',
         'Authorization': ('Bearer ' + api_key)})

    try:
        # On Python 3+, urllib.request.Request / urlopen replace urllib2 here.
        reply = json.loads(urllib2.urlopen(request).read())
        return reply["Results"]["output1"]["value"]["Values"]
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))

        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())

        return json.loads(error.read())
In [38]:
def read_azure_blob(subj, base):
    """Download <base>/<subj>.csv from Azure blob storage and parse it.

    base: URL of the blob container; subj: file name without extension.
    Returns the csv contents as a list of rows (each a list of strings).
    """
    docpath = base + "/" + subj + ".csv"
    response = urllib2.urlopen(docpath)
    try:
        return [row for row in csv.reader(response)]
    finally:
        # Fix: close the HTTP response instead of leaking the connection.
        response.close()
In [42]:
#first read the test data file arxivnophy
# base: URL of the Azure blob container holding the arXiv csv files.
base = "https://scimldata.blob.core.windows.net/arxiv"
# nophytable: list of csv rows; presumably [class, document, title] — TODO confirm.
nophytable = read_azure_blob('arxivnophy', base)

Next, test it with the first entry in the table using the three-way classifier.

In [49]:
# Score the first test row with the three-way service (index 2) and show the result.
x = sendrequest(nophytable[0], url[2], api_key[2])
print x 
[[u"'compsci'", u"'compsci'", u"'compsci'"]]
In [15]:
len(x)
Out[15]:
1
In [84]:
multx = sendbulkrequest(nophytable, url[2], api_key[2])

Let's see how well the three-way classifier does if we ask whether either the first or second choice is correct.

In [85]:
correct = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
num = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
for i in range(len(multx)):
    x = multx[i]
    key = x[0]
    num[key] = num[key]+1.0
    if (key==x[1]) or (key==x[2]):
        correct[key] = correct[key] +1.0
    #else:
for key in correct:
    correct[key] = 100.0*correct[key]/num[key]
print num
print correct
{"'bio'": 316.0, "'finance'": 232.0, "'compsci'": 648.0, "'math'": 742.0}
{"'bio'": 65.82278481012658, "'finance'": 60.3448275862069, "'compsci'": 72.37654320987654, "'math'": 88.27493261455525}
In [72]:
def make_confusion_table(multx):
    """Return a percent-normalized confusion matrix as a DataFrame.

    multx: list of rows [true_class, first_choice, ...] as returned by
    sendbulkrequest.  Columns are predicted classes, rows are true
    classes, and every true-class row of counts is scaled to percent.
    Prints the per-class totals as a side effect.
    """
    names = ["'bio'", "'compsci'", "'finance'", "'math'"]
    tble = {p: {t: 0 for t in names} for p in names}
    for e in multx:
        true_class = e[0]
        predicted = e[1]
        if true_class == 'none':
            # Row with no ground-truth label — skip it, as before.
            # print() call form works on Python 2 and 3 (file mixes both styles).
            print("found none")
        else:
            tble[predicted][true_class] = tble[predicted][true_class] + 1
    for x in names:
        total = 0
        for y in names:
            total = total + tble[y][x]
        print("total for " + x + "= " + str(total))
        if total == 0:
            # Robustness fix: a class absent from the sample previously
            # raised ZeroDivisionError; leave its zero counts untouched.
            continue
        for y in names:
            tble[y][x] = 100.0 * tble[y][x] / (1.0 * total)
    df = pd.DataFrame(tble, columns=["'bio'", "'compsci'", "'finance'", "'math'"])
    return df
In [86]:
# Confusion matrix for the three-way classifier's first choice.
df = make_confusion_table(multx)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[86]:
'bio' 'compsci' 'finance' 'math'
'bio' 50.316456 20.886076 0.949367 27.848101
'compsci' 4.938272 62.654321 1.543210 30.864198
'finance' 5.603448 9.913793 47.844828 36.637931
'math' 3.908356 13.477089 2.291105 80.323450
In [74]:
#next do the confusion matrix for the neural net classifier
multx2 = sendbulkrequest(nophytable, url[0], api_key[0])
In [75]:
# Confusion matrix for the neural net classifier.
df = make_confusion_table(multx2)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[75]:
'bio' 'compsci' 'finance' 'math'
'bio' 51.265823 19.936709 4.746835 24.050633
'compsci' 10.493827 57.716049 4.320988 27.469136
'finance' 6.465517 17.241379 50.431034 25.862069
'math' 6.469003 16.037736 5.525606 71.967655
In [80]:
# now score the full table with the logistic regression service
multx3 = sendbulkrequest(nophytable, url[1], api_key[1])
In [81]:
# Confusion matrix for the logistic regression classifier.
df = make_confusion_table(multx3)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[81]:
'bio' 'compsci' 'finance' 'math'
'bio' 54.113924 23.417722 0.316456 22.151899
'compsci' 6.172840 65.740741 1.234568 26.851852
'finance' 5.172414 9.913793 43.534483 41.379310
'math' 3.504043 14.555256 2.021563 79.919137

As we can see, all three are rather poor; the three-way classifier is only slightly better.

In [ ]: