45
R for Everything Jared P. Lander

R for Everything

Embed Size (px)

Citation preview

Page 1: R for Everything

R for Everything Jared P. Lander

Page 3: R for Everything

3/45

Page 4: R for Everything

4/45

Page 5: R for Everything

Giants

5/45

Page 6: R for Everything

Compressed Data Online

6/45

Page 7: R for Everything

7/45

Page 8: R for Everything

Create Directory

# See if directory exists
dir.exists('FootballTemp')

[1] FALSE

# create it
dir.create('FootballTemp')
# check again
dir.exists('FootballTemp')

[1] TRUE

8/45

Page 9: R for Everything

9/45

Page 10: R for Everything

Download Files

download.file('http://www.jaredlander.com/data/Football1415.tar.gz',
              destfile='FootballTemp/football.tar.gz',
              method='curl')

10/45

Page 11: R for Everything

Untar

11/45

Page 12: R for Everything

getXKCD('1168')

12/45

Page 13: R for Everything

Untar the File

untar('FootballTemp/football.tar.gz', exdir='FootballFiles')

13/45

Page 14: R for Everything

Did They Extract?

dir('FootballFiles')

[1] "pbp-2014.csv" "pbp-2015.csv"

14/45

Page 15: R for Everything

Delete Tar

unlink('FootballTemp/football.tar.gz')
dir('FootballTemp')

character(0)

15/45

Page 16: R for Everything

Inspect One File

file.info('FootballFiles/pbp-2014.csv')

                               size isdir mode               mtime
FootballFiles/pbp-2014.csv 10280324 FALSE  666 2016-03-25 00:14:23
                                         ctime               atime exe
FootballFiles/pbp-2014.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no

16/45

Page 17: R for Everything

Inspect All Files

dir('FootballFiles') %>% file.info

             size isdir mode mtime ctime atime  exe
pbp-2014.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA>
pbp-2015.csv   NA    NA <NA>  <NA>  <NA>  <NA> <NA>

17/45

Page 18: R for Everything

Inspect All Files

dir('FootballFiles', full.names=TRUE) %>% file.info

                               size isdir mode               mtime
FootballFiles/pbp-2014.csv 10280324 FALSE  666 2016-03-25 00:14:23
FootballFiles/pbp-2015.csv 10671016 FALSE  666 2016-03-25 00:14:23
                                         ctime               atime exe
FootballFiles/pbp-2014.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no
FootballFiles/pbp-2015.csv 2016-04-04 22:48:47 2016-04-04 22:48:47  no

18/45

Page 19: R for Everything

Better Names

file.rename(from=dir('FootballFiles', full.names=TRUE),
            to=sprintf('FootballFiles/Football%s.csv', 14:15))

[1] TRUE TRUE

19/45

Page 20: R for Everything

Better Names

dir('FootballFiles')

[1] "Football14.csv" "Football15.csv"

20/45

Page 21: R for Everything

Make Copies

dir.create('FootballFiles/Backup')
file.copy(dir('FootballFiles', full.names=TRUE, pattern='\\.csv'),
          sprintf('FootballFiles/Backup/Footballl%s.csv', 14:15))

[1] TRUE TRUE

21/45

Page 22: R for Everything

Make Copies

dir('FootballFiles', recursive=TRUE)

[1] "Backup/Footballl14.csv" "Backup/Footballl15.csv" "Football14.csv"
[4] "Football15.csv"

22/45

Page 23: R for Everything

Count Columns

count.fields('FootballFiles/Football14.csv', sep=',') %>% head(15)

 [1] 45 45 45 45 15 45 45 45 45 45 45 45 45 45 45

count.fields('FootballFiles/Football15.csv', sep=',') %>% head(15)

 [1] 45 45 45 45 45 45 45 45 45 NA 15 45 45 45 45

23/45

Page 24: R for Everything

Line Count

system('wc -l FootballFiles/Football14.csv')

45696 FootballFiles/Football14.csv

system('wc -l FootballFiles/Football15.csv')

46278 FootballFiles/Football15.csv

24/45

Page 25: R for Everything

Reference Files

dataPath <- 'FootballFiles'
file.path(dataPath, 'Football14.csv')

[1] "FootballFiles/Football14.csv"

file.path(dataPath, 'Football15.csv')

[1] "FootballFiles/Football15.csv"

25/45

Page 26: R for Everything

Read Data

theFiles <- dir(dataPath, pattern='\\.csv', full.names=TRUE)
games <- theFiles %>%
    map_df(read.csv2, sep=',', header=TRUE, stringsAsFactors=FALSE)

26/45

Page 27: R for Everything

See the Data

DT::datatable(data=games%>% slice(sample(nrow(games), size=500, replace=FALSE)),
              rownames=FALSE,
              options = list(
                  dom = "rtiS",
                  scrollY = 400, scrollX=TRUE,
                  scrollCollapse = TRUE),
              filter=list(position='top')
)

27/45

Page 28: R for Everything

See the Data

Showing 1 to 10 of 500 entries

2016010300 2016-01-03 1 12 31 BUF NYJ

2015120608 2015-12-06 2 15 0 ATL TB

2015122100 2015-12-21 1 15 0 DET NO

2014111610 11/16/2014 1 6 6 DET ARI

2015112904 2015-11-29 2 1 23 IND TB

2015122710 2015-12-27 2 14 7 GB ARI

2014101203 10/12/2014 1 11 20 PIT CLE

2015101102 2015-10-11 2 8 2 CIN SEA

GameId GameDate Quarter Minute Second OffenseTeam DefenseTeam Down ToGo

All All All All All All All All All

28/45

Page 29: R for Everything

Pass vs Rush

29/45

Page 30: R for Everything

Focus on One Team's Offense

oneOff <- games %>%
    filter(OffenseTeam == 'NYG', PlayType %in% c('PASS', 'RUSH')) %>%
    mutate(PlayType=factor(PlayType, levels=c('RUSH', 'PASS')),
           Down=factor(Down, levels=c(1, 2, 3, 4)))

30/45

Page 31: R for Everything

Probability of a Pass

passRushMod <- glm(PlayType ~ Down + ToGo - 1, data=oneOff, family=binomial)
coefplot(passRushMod, trans=arm::invlogit, title='Probability of Pass')

31/45

Page 32: R for Everything

Scenarios

# make grid of scenarios
scenarios <- expand.grid(ToGo=1:15, Down=1:4) %>% as.tbl %>%
    mutate(Down=factor(Down, levels=c(1, 2, 3, 4)))
# make prediction based on model
scenarioPredict <- predict(passRushMod,
                           newdata=scenarios, type='response', se.fit=TRUE)
# build confidence intervals
scenarios <- scenarios %>% mutate(Prediction=scenarioPredict$fit,
                                  Lower=Prediction - 2*scenarioPredict$se.fit,
                                  Upper=Prediction + 2*scenarioPredict$se.fit)

32/45

Page 33: R for Everything

Scenarios

ToGo Down Prediction Lower Upper

1 1 0.2754536 0.2135514 0.3373558

2 1 0.2959441 0.2371832 0.3547051

3 1 0.3172914 0.2621339 0.3724488

4 1 0.3394361 0.2882498 0.3906223

5 1 0.3623061 0.3153154 0.4092968

6 1 0.3858171 0.3430379 0.4285962

knitr::kable(head(scenarios))

33/45

Page 34: R for Everything

Probability of Pass

ggplot(scenarios, aes(x=ToGo)) + scale_y_continuous(label=scales::percent) +
    geom_ribbon(aes(ymin=Lower, ymax=Upper), fill='lightgrey') +
    geom_line(aes(y=Prediction)) + facet_wrap(~Down, nrow=2)

34/45

Page 35: R for Everything

Get Eli's Stats

eliPage <- read_html('http://www.pro-football-reference.com/players/M/MannEl00.htm')

eliStats <- eliPage %>% html_nodes("#passing") %>%
    html_table(header=TRUE) %>% getElement(1)
useful::topleft(eliStats, c=7, r=8)

   Year Age  Tm Pos No.  G GS
1  2004  23 NYG  qb  10  9  7
2  2005  24 NYG  QB  10 16 16
3  2006  25 NYG  QB  10 16 16
4  2007  26 NYG  QB  10 16 16
5 2008*  27 NYG  QB  10 16 16
6  2009  28 NYG  QB  10 16 16
7  2010  29 NYG  QB  10 16 16
8 2011*  30 NYG  QB  10 16 16

35/45

Page 36: R for Everything

36/45

Page 37: R for Everything

Save Them

dir.create('results')
ggsave('results/EliPass.png')
write.table(eliStats, file='results/eliStats.csv', sep=',', row.names=FALSE)

[1] TRUE

[1] TRUE

37/45

Page 38: R for Everything

38/45

Page 39: R for Everything

Commit Them

repo <- repository(getwd())
add(repo, file.path('results', c('eliPass.png', 'eliStats.csv')))
commit(repo, message='Tracking plot and csv')
push(repo)

39/45

Page 40: R for Everything

40/45

Page 41: R for Everything

Email Them

footballResults <- mime(
    To = "[email protected]",
    From = "[email protected]",
    Subject = "Eli Results",
    body = "See the attached graph and data.") %>%
    attach_file('results/EliPass.png') %>%
    attach_file('results/eliStats.csv')
send_message(footballResults)

41/45

Page 42: R for Everything

Things We've Done

Create Directories

Query Directories

Untar Files

Read XKCD

Delete Files

Get File Info

Move Files

Copy Files

Count Columns

Run System Commands

·

·

·

·

·

·

·

·

·

·

Build File Paths

Read Data

Munge Data

Fit a GLM

Make Predictions

Generate Plots

Save Files

Scrape a Website

Commit and Push to Git

Send an Email

·

·

·

·

·

·

·

·

·

·

42/45

Page 43: R for Everything

Jared P. Lander

Chief Data Scientist of Lander Analytics

Author of R for Everyone

Adjunct Professor at Columbia University

Organizer of New York Open Statistical Programming (The R) Meetup

Website: http://www.jaredlander.com

·

·

·

·

·

43/45