import matplotlib
import numpy as np
import matplotlib.pyplot as plot
%matplotlib inline
import datalab.bigquery as bq
%bigquery sample --table lookerdata:cdc.project_tycho_reports --count 5
We next modify the query in the Datalab example to select for Rubella and occurances between 1972 and 1973 in Indiana and Washington.
%%sql --module rubella0
SELECT
*
FROM (
SELECT
*,
MIN(z___rank) OVER (PARTITION BY cdc_reports_epi_week) AS z___min_rank
FROM (
SELECT
*,
RANK() OVER (PARTITION BY cdc_reports_state ORDER BY cdc_reports_epi_week ) AS z___rank
FROM (
SELECT
cdc_reports.epi_week AS cdc_reports_epi_week,
cdc_reports.state AS cdc_reports_state,
COALESCE(CAST(SUM((FLOAT(cdc_reports.cases))) AS FLOAT),0) AS cdc_reports_total_cases
FROM
[lookerdata:cdc.project_tycho_reports] AS cdc_reports
WHERE
(cdc_reports.disease = 'RUBELLA')
AND (FLOOR(cdc_reports.epi_week/100) = 1972 OR FLOOR(cdc_reports.epi_week/100) = 1973)
AND (cdc_reports.state = 'IN'
OR cdc_reports.state = 'WA')
GROUP EACH BY
1,
2) ww ) aa ) xx
WHERE
z___min_rank <= 500
LIMIT
30000
now run the query and convert it to a pandas dataframe.
rubel = bq.Query(rubella).to_dataframe()
We can now grab the incedence of rubella in both states. To make the graph we will pull out the time scale and the number of cases into seperate arrays. Because the way the CDC numbers them, we need to make a few adjustments. This has to do with the fact that the CDC record the 52 week in a year as starting from the year 1972 in a code as 197201 to 197252 and 1973 as 197301 to 197352. Subtracting 197200 gives us the date codes 1..52,101..152. Subtracting 48 from 101..152 gives 53..104 so this corrects the sequence.
rubelIN = rubel[rubel['cdc_reports_state']=='IN'].sort_values(by=['cdc_reports_epi_week'])
rubelWA = rubel[rubel['cdc_reports_state']=='WA'].sort_values(by=['cdc_reports_epi_week'])
epiweekIN = rubelIN['cdc_reports_epi_week']
epiweekWA = rubelWA['cdc_reports_epi_week']
rubelINval = rubelIN['cdc_reports_total_cases']
rubelWAval = rubelWA['cdc_reports_total_cases']
realweekI = np.empty([len(epiweekIN)])
realweekI[:] = epiweekIN[:]-197200
realweekI[49:] = realweekI[49:]-48
realweekW= np.empty([len(epiweekWA)])
realweekW[:] = epiweekWA[:]-197200
realweekW[49:] = realweekW[49:]-48
realweekW
import matplotlib
import numpy as np
import matplotlib.pyplot as plot
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 20, 5
with plot.style.context('fivethirtyeight'):
plot.plot(realweekI, rubelINval, linewidth=2)
plot.plot(realweekW, rubelWAval, linewidth=2)
Next we look at the two preceding years. rubella0 above was modified to get 1970 to 71. and that data is stored in rubel0. We go through the same normalization and concatinate the strings and plot the four year span below.
%%sql --module rubella0
SELECT
*
FROM (
SELECT
*,
MIN(z___rank) OVER (PARTITION BY cdc_reports_epi_week) AS z___min_rank
FROM (
SELECT
*,
RANK() OVER (PARTITION BY cdc_reports_state ORDER BY cdc_reports_epi_week ) AS z___rank
FROM (
SELECT
cdc_reports.epi_week AS cdc_reports_epi_week,
cdc_reports.state AS cdc_reports_state,
COALESCE(CAST(SUM((FLOAT(cdc_reports.cases))) AS FLOAT),0) AS cdc_reports_total_cases
FROM
[lookerdata:cdc.project_tycho_reports] AS cdc_reports
WHERE
(cdc_reports.disease = 'RUBELLA')
AND (FLOOR(cdc_reports.epi_week/100) = 1970 OR FLOOR(cdc_reports.epi_week/100) = 1971)
AND (cdc_reports.state = 'IN'
OR cdc_reports.state = 'WA')
GROUP EACH BY
1,
2) ww ) aa ) xx
WHERE
z___min_rank <= 500
LIMIT
30000
rubel0 = bq.Query(rubella0).to_dataframe()
rubelIN0 = rubel0[rubel0['cdc_reports_state']=='IN'].sort_values(by=['cdc_reports_epi_week'])
rubelWA0 = rubel0[rubel0['cdc_reports_state']=='WA'].sort_values(by=['cdc_reports_epi_week'])
epiweekIN0 = rubelIN0['cdc_reports_epi_week']
epiweekWA0 = rubelWA0['cdc_reports_epi_week']
rubelINval0 = rubelIN0['cdc_reports_total_cases']
rubelWAval0 = rubelWA0['cdc_reports_total_cases']
realweekI0 = np.empty([len(epiweekIN0)])
realweekI0[:] = epiweekIN0[:]-197000
realweekI0[51:] = realweekI0[51:]-48
realweekW0= np.empty([len(epiweekWA0)])
realweekW0[:] = epiweekWA0[:]-197000
realweekW0[51:] = realweekW0[51:]-48
rwI = np.empty([len(realweekI)+len(realweekI0)])
k = len(realweekI0)
rwI[0:k] = realweekI0[:]
rwI[k:] = realweekI[:]+104
rvI = np.empty([len(rubelINval)+len(rubelINval0)])
rvI[0:k] = rubelINval0
rvI[k:] = rubelINval
rwW = np.empty([len(realweekW)+len(realweekW0)])
k = len(realweekW0)
rwW[0:k] = realweekW0[:]
rwW[k:] = realweekW[:]+104
rvW = np.empty([len(rubelWAval)+len(rubelWAval0)])
rvW[0:k] = rubelWAval0
rvW[k:] = rubelWAval
from pylab import rcParams
rcParams['figure.figsize'] = 20, 5
with plot.style.context('fivethirtyeight'):
plot.plot(rwI, rvI, linewidth=2)
plot.plot(rwW, rvW, linewidth=2)