Azure ML service sample.

This notebook illustrates invoking the webservice for the three-way classifier with the arXiv data. The three-way example uses three classifiers: a multiclass neural net, a multiclass logistic regression and a random forest classifier. The three-way classifier uses a majority-vote system: if any two of the classifiers agree, that label is the main reply of the three-way. The remaining label is always put forward as a second choice.
The Python code for the three-way Python module is shown here.

import pandas as pd

# Map the numeric class codes emitted by the scorers back to label strings.
labels = { 1: "'bio'", 2: "'compsci'", 3: "'finance'", 4: "'math'"}

def azureml_main(dataframe1 = None, dataframe2 = None):
    """Combine three classifier outputs by majority vote.

    dataframe1 carries the true class ("Col1") and the first model's
    "Scored Labels"; dataframe2 carries the other two models' scores in
    "Scored Labels" and "Scored Labels (2)".

    Returns a DataFrame with the true class, the winning label and a
    runner-up label.
    """
    true_class = dataframe1["Col1"]
    vote1 = dataframe1["Scored Labels"]
    vote2 = dataframe2["Scored Labels"]
    vote3 = dataframe2["Scored Labels (2)"]
    winners, runners_up, classes = [], [], []
    for i in range(len(true_class)):
        classes.append(true_class[i])
        if vote2[i] == vote3[i]:
            first, runner = vote2[i], vote1[i]
        elif vote2[i] == vote1[i]:
            first, runner = vote1[i], vote3[i]
        elif vote1[i] == vote3[i]:
            first, runner = vote1[i], vote2[i]
        else:
            # No two models agree: fall back to model 1, offer model 3.
            first, runner = vote1[i], vote3[i]
        winners.append(labels[first])
        runners_up.append(labels[runner])

    data = {'Col1': classes, 'Scored Labels': winners, 'second': runners_up}
    return pd.DataFrame(data, columns=['Col1', 'Scored Labels', 'second'])

The code below assumes that the test file is stored in Azure blob storage as a csv file. It uses the three-way classifier, the neural net classifier and the logistic regression classifier, and compares the confusion matrix of each.

In [34]:
import sys
import azure
import socket
import csv
import urllib2
import json
import pandas as pd
import numpy as np
import urllib
import json
import pickle
import unicodedata
In [ ]:
The following establishes the URLs and API keys for each of the three classifiers.
In [76]:
# Service endpoints and matching API keys, indexed in lockstep:
#   0 = neural net, 1 = logistic regression, 2 = three-way combiner.
url = [
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/281bbba94a9b47608351313de7c34fac/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/08e77a94e7e0444687286a98f889be3d/execute?api-version=2.0&details=true ',
    'https://ussouthcentral.services.azureml.net/workspaces/38d1439e6956413e9fe2950b6530c117/services/462fccc1e35e451e96d2678e66930654/execute?api-version=2.0&details=true ',
]
api_key = [
    '5dtOV9rJgoI8LtnX4N7vNMfhW4vdSPHcpCq1IUNQyTPTSwzIHQ4NADF1KkKvI/Q4mJSVFQh7JQcVyVxk7yuG/Q==',
    'RkHxsJXpnBFv3CVeL5B927VlKgUbQZBaHp8BbQ2QeMgPkCqUG+W9QY9RanTO3zMXnTywpQBX+db5+n6xxWkugA==',
    'e7iaNA+Pi9KdKZ70+Vv7JvZ81E3vKfqQShFuMf+a89GfGrixAHNq4HaHJZhpCY3TBujJmJh3hfKNtoYY3eKvnw==',
]
In [3]:
def sendrequest(datalist, url, api_key):
    """POST one input row to an Azure ML web service and return the scores.

    datalist: a single row ["class", "document", "title"].
    url / api_key: endpoint and bearer key of the service to invoke.

    Returns the "Values" list from the service output on success, or the
    parsed error body on an HTTP error (previously the error body was only
    printed and None was returned implicitly; returning it makes this
    consistent with sendbulkrequest).
    """
    data = {
        "Inputs": {
            "input1": {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": [datalist]
            },
        },
        "GlobalParameters": {}
    }

    body = str.encode(json.dumps(data))
    headers = {'Content-Type': 'application/json',
               'Authorization': ('Bearer ' + api_key)}

    req = urllib2.Request(url, body, headers)

    try:
        response = urllib2.urlopen(req)

        # If you are using Python 3+, replace urllib2 with urllib.request:
        # req = urllib.request.Request(url, body, headers)
        # response = urllib.request.urlopen(req)

        result = response.read()
        y = json.loads(result)
        return y["Results"]["output1"]["value"]["Values"]

    # 'as' form is valid on Python 2.6+ and required on Python 3.
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))

        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())

        return json.loads(error.read())
    
In [4]:
#this function is used to send multiple requests to the web service with one invocation.
def sendbulkrequest(datalist, url, api_key):
    """Score many rows in a single web-service call.

    datalist: a list of rows, each ["class", "document", "title"].
    Returns the scored "Values" on success; on an HTTP error, prints the
    status code and headers and returns the parsed error body instead.
    """
    payload = {
        "Inputs": {
            "input1": {
                "ColumnNames": ["Col1", "Col2", "Col3"],
                "Values": datalist
            },
        },
        "GlobalParameters": {}
    }

    request = urllib2.Request(
        url,
        str.encode(json.dumps(payload)),
        {'Content-Type': 'application/json',
         'Authorization': ('Bearer ' + api_key)})

    try:
        # On Python 3+, urllib.request.Request / urlopen replace urllib2 here.
        reply = json.loads(urllib2.urlopen(request).read())
        return reply["Results"]["output1"]["value"]["Values"]
    except urllib2.HTTPError as error:
        print("The request failed with status code: " + str(error.code))

        # The headers include the request ID and the timestamp, which are
        # useful for debugging the failure.
        print(error.info())

        return json.loads(error.read())
In [38]:
def read_azure_blob(subj, base):
    """Download <base>/<subj>.csv from Azure blob storage and parse it.

    base: URL of the blob container; subj: file name without extension.
    Returns the csv contents as a list of rows (each a list of strings).
    """
    docpath = base + "/" + subj + ".csv"
    response = urllib2.urlopen(docpath)
    try:
        return [row for row in csv.reader(response)]
    finally:
        # Fix: close the HTTP response instead of leaking the connection.
        response.close()
In [42]:
#first read the test data file arxivnophy
# base: URL of the Azure blob container holding the arXiv csv files.
base = "https://scimldata.blob.core.windows.net/arxiv"
# nophytable: list of csv rows; presumably [class, document, title] — TODO confirm.
nophytable = read_azure_blob('arxivnophy', base)

Next, test it with the first entry in the table using the three-way classifier.

In [49]:
# Score the first test row with the three-way service (index 2) and show the result.
x = sendrequest(nophytable[0], url[2], api_key[2])
print x 
[[u"'compsci'", u"'compsci'", u"'compsci'"]]
In [15]:
len(x)
Out[15]:
1
In [84]:
multx = sendbulkrequest(nophytable, url[2], api_key[2])

Let's see how well the three-way classifier does if we ask whether either the first or second choice is correct.

In [85]:
correct = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
num = {"'compsci'": 0, "'finance'": 0, "'bio'": 0, "'math'": 0}
for i in range(len(multx)):
    x = multx[i]
    key = x[0]
    num[key] = num[key]+1.0
    if (key==x[1]) or (key==x[2]):
        correct[key] = correct[key] +1.0
    #else:
for key in correct:
    correct[key] = 100.0*correct[key]/num[key]
print num
print correct
{"'bio'": 316.0, "'finance'": 232.0, "'compsci'": 648.0, "'math'": 742.0}
{"'bio'": 65.82278481012658, "'finance'": 60.3448275862069, "'compsci'": 72.37654320987654, "'math'": 88.27493261455525}
In [72]:
def make_confusion_table(multx):
    """Return a percent-normalized confusion matrix as a DataFrame.

    multx: list of rows [true_class, first_choice, ...] as returned by
    sendbulkrequest.  Columns are predicted classes, rows are true
    classes, and every true-class row of counts is scaled to percent.
    Prints the per-class totals as a side effect.
    """
    names = ["'bio'", "'compsci'", "'finance'", "'math'"]
    tble = {p: {t: 0 for t in names} for p in names}
    for e in multx:
        true_class = e[0]
        predicted = e[1]
        if true_class == 'none':
            # Row with no ground-truth label — skip it, as before.
            # print() call form works on Python 2 and 3 (file mixes both styles).
            print("found none")
        else:
            tble[predicted][true_class] = tble[predicted][true_class] + 1
    for x in names:
        total = 0
        for y in names:
            total = total + tble[y][x]
        print("total for " + x + "= " + str(total))
        if total == 0:
            # Robustness fix: a class absent from the sample previously
            # raised ZeroDivisionError; leave its zero counts untouched.
            continue
        for y in names:
            tble[y][x] = 100.0 * tble[y][x] / (1.0 * total)
    df = pd.DataFrame(tble, columns=["'bio'", "'compsci'", "'finance'", "'math'"])
    return df
In [86]:
# Confusion matrix for the three-way classifier's first choice.
df = make_confusion_table(multx)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[86]:
'bio' 'compsci' 'finance' 'math'
'bio' 50.316456 20.886076 0.949367 27.848101
'compsci' 4.938272 62.654321 1.543210 30.864198
'finance' 5.603448 9.913793 47.844828 36.637931
'math' 3.908356 13.477089 2.291105 80.323450
In [74]:
#next do the confusion matrix for the neural net classifier
multx2 = sendbulkrequest(nophytable, url[0], api_key[0])
In [75]:
# Confusion matrix for the neural net classifier.
df = make_confusion_table(multx2)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[75]:
'bio' 'compsci' 'finance' 'math'
'bio' 51.265823 19.936709 4.746835 24.050633
'compsci' 10.493827 57.716049 4.320988 27.469136
'finance' 6.465517 17.241379 50.431034 25.862069
'math' 6.469003 16.037736 5.525606 71.967655
In [80]:
# now score the full table with the logistic regression service
multx3 = sendbulkrequest(nophytable, url[1], api_key[1])
In [81]:
# Confusion matrix for the logistic regression classifier.
df = make_confusion_table(multx3)
df
total for 'bio'= 316
total for 'compsci'= 648
total for 'finance'= 232
total for 'math'= 742
Out[81]:
'bio' 'compsci' 'finance' 'math'
'bio' 54.113924 23.417722 0.316456 22.151899
'compsci' 6.172840 65.740741 1.234568 26.851852
'finance' 5.172414 9.913793 43.534483 41.379310
'math' 3.504043 14.555256 2.021563 79.919137

As we can see, all three are rather poor; the three-way classifier is only slightly better.

In [ ]: