Upload
work-bench
View
7.426
Download
2
Embed Size (px)
Citation preview
R for Everything
Jared P. Lander
3/45
4/45
Compressed Data Online
6/45
7/45
Create Directory
# See if directory exists
dir.exists('FootballTemp')
[1] FALSE
# create it
dir.create('FootballTemp')
# check again
dir.exists('FootballTemp')
[1] TRUE
8/45
9/45
Download Files
download.file('http://www.jaredlander.com/data/Football1415.tar.gz', destfile='FootballTemp/football.tar.gz', method='curl')
10/45
Untar
11/45
Untar the File
untar('FootballTemp/football.tar.gz', exdir='FootballFiles')
13/45
Did They Extract?
dir('FootballFiles')
[1] "pbp-2014.csv" "pbp-2015.csv"
14/45
Delete Tar
unlink('FootballTemp/football.tar.gz')
dir('FootballTemp')
character(0)
15/45
Inspect One File
file.info('FootballFiles/pbp-2014.csv')
                               size isdir mode               mtime
FootballFiles/pbp-2014.csv 10280324 FALSE  666 2016-03-25 00:14:23
                                         ctime               atime exe
FootballFiles/pbp-2014.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no
16/45
Inspect All Files
dir('FootballFiles') %>% file.info
             size isdir mode mtime ctime atime  exe
pbp-2014.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA>
pbp-2015.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA>
17/45
Inspect All Files
dir('FootballFiles', full.names=TRUE) %>% file.info
                               size isdir mode               mtime
FootballFiles/pbp-2014.csv 10280324 FALSE  666 2016-03-25 00:14:23
FootballFiles/pbp-2015.csv 10671016 FALSE  666 2016-03-25 00:14:23
                                         ctime               atime exe
FootballFiles/pbp-2014.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no
FootballFiles/pbp-2015.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no
18/45
Better Names
file.rename(from=dir('FootballFiles', full.names=TRUE), to=sprintf('FootballFiles/Football%s.csv', 14:15))
[1] TRUE TRUE
19/45
Better Names
dir('FootballFiles')
[1] "Football14.csv" "Football15.csv"
20/45
Make Copies
dir.create('FootballFiles/Backup')
file.copy(dir('FootballFiles', full.names=TRUE, pattern='\\.csv'), sprintf('FootballFiles/Backup/Footballl%s.csv', 14:15))
[1] TRUE TRUE
21/45
Make Copies
dir('FootballFiles', recursive=TRUE)
[1] "Backup/Footballl14.csv" "Backup/Footballl15.csv" "Football14.csv" [4] "Football15.csv"
22/45
Count Columns
count.fields('FootballFiles/Football14.csv', sep=',') %>% head(15)
[1] 45 45 45 45 15 45 45 45 45 45 45 45 45 45 45
count.fields('FootballFiles/Football15.csv', sep=',') %>% head(15)
[1] 45 45 45 45 45 45 45 45 45 NA 15 45 45 45 45
23/45
Line Count
system('wc -l FootballFiles/Football14.csv')
45696 FootballFiles/Football14.csv
system('wc -l FootballFiles/Football15.csv')
46278 FootballFiles/Football15.csv
24/45
Reference Files
dataPath <- 'FootballFiles'
file.path(dataPath, 'Football14.csv')
[1] "FootballFiles/Football14.csv"
file.path(dataPath, 'Football15.csv')
[1] "FootballFiles/Football15.csv"
25/45
Read Data
theFiles <- dir(dataPath, pattern='\\.csv', full.names=TRUE)
games <- theFiles %>% map_df(read.csv2, sep=',', header=TRUE, stringsAsFactors=FALSE)
26/45
See the Data
DT::datatable(data=games%>% slice(sample(nrow(games), size=500, replace=FALSE)), rownames=FALSE, options = list( dom = "rtiS", scrollY = 400, scrollX=TRUE, scrollCollapse = TRUE), filter=list(position='top') )
27/45
See the Data
Showing 1 to 10 of 500 entries
2016010300 2016-01-03 1 12 31 BUF NYJ
2015120608 2015-12-06 2 15 0 ATL TB
2015122100 2015-12-21 1 15 0 DET NO
2014111610 11/16/2014 1 6 6 DET ARI
2015112904 2015-11-29 2 1 23 IND TB
2015122710 2015-12-27 2 14 7 GB ARI
2014101203 10/12/2014 1 11 20 PIT CLE
2015101102 2015-10-11 2 8 2 CIN SEA
GameId GameDate Quarter Minute Second OffenseTeam DefenseTeam Down ToGo
All All All All All All All All All
28/45
Pass vs Rush
29/45
Focus on One Team's Offense
oneOff <- games %>%
    filter(OffenseTeam == 'NYG', PlayType %in% c('PASS', 'RUSH')) %>%
    mutate(PlayType=factor(PlayType, levels=c('RUSH', 'PASS')),
           Down=factor(Down, levels=c(1, 2, 3, 4)))
30/45
Probability of a Pass
passRushMod <- glm(PlayType ~ Down + ToGo - 1, data=oneOff, family=binomial)
coefplot(passRushMod, trans=arm::invlogit, title='Probability of Pass')
31/45
Scenarios
# make grid of scenarios
scenarios <- expand.grid(ToGo=1:15, Down=1:4) %>%
    as.tbl %>%
    mutate(Down=factor(Down, levels=c(1, 2, 3, 4)))
# make prediction based on model
scenarioPredict <- predict(passRushMod, newdata=scenarios, type='response', se.fit=TRUE)
# build confidence intervals
scenarios <- scenarios %>%
    mutate(Prediction=scenarioPredict$fit,
           Lower=Prediction - 2*scenarioPredict$se.fit,
           Upper=Prediction + 2*scenarioPredict$se.fit)
32/45
Scenarios
ToGo Down Prediction Lower Upper
1 1 0.2754536 0.2135514 0.3373558
2 1 0.2959441 0.2371832 0.3547051
3 1 0.3172914 0.2621339 0.3724488
4 1 0.3394361 0.2882498 0.3906223
5 1 0.3623061 0.3153154 0.4092968
6 1 0.3858171 0.3430379 0.4285962
knitr::kable(head(scenarios))
33/45
Probability of Pass
ggplot(scenarios, aes(x=ToGo)) + scale_y_continuous(label=scales::percent) + geom_ribbon(aes(ymin=Lower, ymax=Upper), fill='lightgrey') + geom_line(aes(y=Prediction)) + facet_wrap(~Down, nrow=2)
34/45
Get Eli's Stats
eliPage <- read_html('http://www.pro-football-reference.com/players/M/MannEl00.htm')
eliStats <- eliPage %>% html_nodes("#passing") %>% html_table(header=TRUE) %>% getElement(1)
useful::topleft(eliStats, c=7, r=8)
   Year Age  Tm Pos No.  G GS
1  2004  23 NYG  qb  10  9  7
2  2005  24 NYG  QB  10 16 16
3  2006  25 NYG  QB  10 16 16
4  2007  26 NYG  QB  10 16 16
5 2008*  27 NYG  QB  10 16 16
6  2009  28 NYG  QB  10 16 16
7  2010  29 NYG  QB  10 16 16
8 2011*  30 NYG  QB  10 16 16
35/45
36/45
Save Them
dir.create('results')
ggsave('results/EliPass.png')
write.table(eliStats, file='results/eliStats.csv', sep=',', row.names=FALSE)
[1] TRUE
[1] TRUE
37/45
38/45
Commit Them
repo <- repository(getwd())
add(repo, file.path('results', c('EliPass.png', 'eliStats.csv')))
commit(repo, message='Tracking plot and csv')
push(repo)
39/45
40/45
Email Them
footballResults <- mime(
    To = "[email protected]",
    From = "[email protected]",
    Subject = "Eli Results",
    body = "See the attached graph and data.") %>%
    attach_file('results/EliPass.png') %>%
    attach_file('results/eliStats.csv')
send_message(footballResults)
41/45
Things We've Done
Create Directories
Query Directories
Untar Files
Read XKCD
Delete Files
Get File Info
Move Files
Copy Files
Count Columns
Run System Commands
·
·
·
·
·
·
·
·
·
·
Build File Paths
Read Data
Munge Data
Fit a GLM
Make Predictions
Generate Plots
Save Files
Scrape a Website
Commit and Push to Git
Send an Email
·
·
·
·
·
·
·
·
·
·
42/45
Jared P. Lander
Chief Data Scientist of Lander Analytics
Author of R for Everyone
Adjunct Professor at Columbia University
Organizer of New York Open Statistical Programming (The R) Meetup
Website: http://www.jaredlander.com
·
·
·
·
·
43/45
Packages
rvest
ggplot2
dplyr
purrr
coefplot
magrittr
useful
·
·
·
·
·
·
·
44/45
The Tools
R
RStudio
knitr
Pandoc
ioslides
·
·
·
·
·
45/45