New York Times COVID-19 Database¶

Data Description¶

Repository and Download¶

The NY Times released their COVID-19 database on GitHub. The repository's README.md warns that, because of limited testing, the reported counts are below the real counts.

# Zip archive of the master branch of the nytimes/covid-19-data GitHub repository.
download_url="https://github.com/nytimes/covid-19-data/archive/master.zip"

import pandas as pd
import numpy as np
import datetime as dt
import requests,zipfile,os,re

# Local directory that holds the downloaded archive and its extracted contents.
data_dir="Data/NYT_Covid_19_Databases/"
down_file=data_dir+"NTY_Covid_19.zip"

# Create the target directory first; open() below fails if it does not exist.
os.makedirs(data_dir,exist_ok=True)

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being saved as the zip.
response=requests.get(download_url,timeout=60)
response.raise_for_status()
with open(down_file,'wb') as f:
    f.write(response.content)
# Unpack next to the archive; the files land in a sub-directory of data_dir.
with zipfile.ZipFile(down_file) as zipper:
    zipper.extractall(data_dir)

def dirTree(rootDir='.'):
    """Print each directory under `rootDir` followed by the files it holds.

    Directories are visited in `os.walk` order. Every file name is printed
    on its own tab-indented line beneath the directory that contains it.
    """
    for folder, _subfolders, filenames in os.walk(rootDir):
        print(f'Found directory: {folder}')
        # One tab-indented line per file; prints nothing for an empty directory.
        listing = ''.join(f'\t{entry}\n' for entry in filenames)
        print(listing, end='')

# List the contents of the data directory after the archive has been unpacked.
dirTree(data_dir)

Found directory: Data/NYT_Covid_19_Databases/
	NTY_Covid_19.zip
Found directory: Data/NYT_Covid_19_Databases/covid-19-data-master
	LICENSE
	README.md
	us-counties.csv
	us-states.csv
	us.csv

Description¶

Three csv files exist in the database; the county-level and state-level files are used here.

# Both tables live in the directory produced by unpacking the archive.
_extract_dir="Data/NYT_Covid_19_Databases/covid-19-data-master/"
counties_file=_extract_dir+"us-counties.csv"
states_file=_extract_dir+"us-states.csv"

Counties Description¶

# Load the county-level table.
counties=pd.read_csv(counties_file)
# Parse the dates, as is done for the states table, so that frames covering
# different date ranges line up on a shared time axis when plotted together
# instead of being positioned by row number.
counties['date']=pd.to_datetime(counties['date'])
# log10(x+1) keeps rows with zero cases or deaths finite.
counties['LogCases']=np.log10(counties.cases+1)
counties['LogDeaths']=np.log10(counties.deaths+1)

# Peek at the first three rows.
counties[:3]

# Summary statistics of the table.
counties.describe()

# Number of rows recorded for Collin County, Texas. Series.sum() counts the
# True values in one vectorized pass instead of iterating element by element
# with the builtin sum().
((counties.county=='Collin')&(counties.state=='Texas')).sum()

37

# Filter on state as well as county name, like the other two selections and
# the Collin/Texas row count: a county name alone may match rows from more
# than one state.
collin=counties[(counties.county=='Collin')&(counties.state=='Texas')].copy()

dallas=counties[(counties.county=='Dallas')&(counties.state=='Texas')].copy()

# Westchester is additionally limited to rows dated 2020-03-10 or later.
westchester=counties[(counties.county=='Westchester')&
                     (counties.state=='New York')&
                     (counties.date>='2020-03-10')
                    ].copy()

# (frame, legend label) pairs, in the order they are drawn on each figure.
county_series=((collin,"Collin"),(dallas,"Dallas"),(westchester,"Westchester"))

# First figure: log10(cases+1) per county.
p=None
for frame,name in county_series:
    # ax=None on the first pass lets pandas create the axes; later passes reuse them.
    p=frame.plot(x="date",y="LogCases",label=name,ax=p)

# Second figure: raw case counts per county.
p=None
for frame,name in county_series:
    p=frame.plot(x="date",y="cases",label=name,ax=p)

States Descriptions¶

# Load the state-level table and turn the date strings into timestamps.
states=pd.read_csv(states_file)
states["date"]=pd.to_datetime(states["date"])
# Append log10(x+1) columns; the +1 keeps zero counts finite.
states=states.assign(
    LogCases=np.log10(states["cases"]+1),
    LogDeaths=np.log10(states["deaths"]+1),
)

states.columns

Index(['date', 'state', 'fips', 'cases', 'deaths', 'LogCases', 'LogDeaths'], dtype='object')

def _rows_for_state(name):
    """Return an independent copy of the rows of `states` for one state."""
    return states[states.state==name].copy()

texas=_rows_for_state('Texas')
newyork=_rows_for_state('New York')
california=_rows_for_state('California')
florida=_rows_for_state('Florida')
louisiana=_rows_for_state('Louisiana')
virginia=_rows_for_state('Virginia')
georgia=_rows_for_state('Georgia')

# Overlay log10(cases+1) for the selected states on a single set of axes.
p=None
for frame,name in ((texas,"Texas"),(newyork,"New York"),(california,"California"),
                   (florida,"Florida"),(louisiana,"Louisiana"),
                   (virginia,"Virginia"),(georgia,"Georgia")):
    # ax=None on the first pass lets pandas create the axes; later passes reuse them.
    p=frame.plot(x="date",y="LogCases",label=name,ax=p)
# Leave the axes as the cell's final expression, as the original last call did.
p

<matplotlib.axes._subplots.AxesSubplot at 0x7ff7a4a80eb8>

# Overlay raw case counts for the selected states on a single set of axes.
p=None
for frame,name in ((texas,"Texas"),(newyork,"New York"),(california,"California"),
                   (florida,"Florida"),(louisiana,"Louisiana"),
                   (virginia,"Virginia"),(georgia,"Georgia")):
    # ax=None on the first pass lets pandas create the axes; later passes reuse them.
    p=frame.plot(x="date",y="cases",label=name,ax=p)
# Leave the axes as the cell's final expression, as the original last call did.
p

<matplotlib.axes._subplots.AxesSubplot at 0x7ff7a49a5438>

Saving Data¶

import MySQLdb,sqlalchemy
import datetime as dt

# NOTE(review): the database credentials are embedded in this URL; consider
# loading them from configuration that is kept out of version control.
Engine=sqlalchemy.create_engine('mysql://cov19:LionsTigersAndBears@localhost/CoV19')

# One timestamp per run, so both tables of a snapshot share the same suffix.
stamp_time=dt.datetime.now().strftime("%Y%m%d%H%M%S")
states_table="NYT_States_"+stamp_time
counties_table="NYT_Counties_"+stamp_time

states.to_sql(states_table,con=Engine)

counties.to_sql(counties_table,con=Engine)

# Read back from the table written above. The previous hard-coded name came
# from an earlier run and does not match the names generated by stamp_time.
# The connection is opened explicitly and closed when the block exits, so the
# rows are fetched into a list before leaving it.
with Engine.connect() as conn:
    r=conn.execute(sqlalchemy.text("select * from "+states_table+" limit 10")).fetchall()

Plotting¶

Here we take a look at using Bokeh.

from bokeh.plotting import figure, output_file,output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

# Direct Bokeh output to the notebook.
output_notebook()

# Random five-row peek at the states table.
states.sample(5)

# Offset of every Texas row from the earliest date with at least 3 deaths.
first_3_deaths=texas.loc[texas.deaths>=3,"date"].min()
texas['days_since_3_deaths']=texas.date-first_3_deaths

# One DataFrame per state, in groupby order.
astates=[pd.DataFrame(group) for _,group in states.groupby('state',as_index=False)]

# One faint LogDeaths line per state; a hovered line is drawn fully opaque
# and the tooltip shows the state name attached to that line.
p=figure(x_axis_type='datetime')
for state_df in astates:
    # Every row of a per-state frame carries the same state name.
    state_name=state_df.state.iloc[0]
    p.line(x='date',y='LogDeaths',source=ColumnDataSource(state_df),
           name=state_name,line_alpha=0.2,hover_alpha=1.0)
hover=HoverTool(tooltips=[('State','$name')])
p.add_tools(hover)
show(p)

# National death count over time.
p=figure(x_axis_type='datetime')
# Sum the per-state rows for each date. numeric_only=True keeps the text
# 'state' column out of the aggregation. Of the summed columns only cases and
# deaths are meaningful totals; the fips and Log* sums are not.
usa=states.groupby('date').sum(numeric_only=True).reset_index()
source=ColumnDataSource(usa)
p.line(x='date',y='deaths',line_alpha=1.0,source=source)
show(p)

	date	county	state	fips	cases	LogCases
0	2020-01-21	Snohomish	Washington	53061.0	1	0.30103
1	2020-01-22	Snohomish	Washington	53061.0	1	0.30103
2	2020-01-23	Snohomish	Washington	53061.0	1	0.30103

	fips	cases	deaths	LogCases	LogDeaths
count	58472.000000	59249.000000	59249.000000	59249.000000	59249.000000
mean	29578.745246	116.523570	3.727590	1.025774	0.165260
std	15535.248926	1457.200963	77.144754	0.707186	0.358554
min	1001.000000	0.000000	0.000000	0.000000	0.000000
25%	17169.000000	2.000000	0.000000	0.477121	0.000000
50%	28139.000000	6.000000	0.000000	0.845098	0.000000
75%	42129.000000	24.000000	1.000000	1.397940	0.301030
max	56043.000000	110465.000000	7690.000000	5.043229	3.885983

	date	state	fips	cases	deaths	LogCases	LogDeaths
345	2020-03-06	Tennessee	47	1	0	0.301030	0.000000
2080	2020-04-09	New Hampshire	33	819	21	2.913814	1.342423
2148	2020-04-10	Rhode Island	44	2015	49	3.304491	1.698970
27	2020-01-30	Washington	53	1	0	0.301030	0.000000
1539	2020-03-30	Pennsylvania	42	4156	48	3.618780	1.690196

Erhart Family

Gaussian

Data Sources