Collaborators: moi, yo, myself

Required libraries:

library(tidyverse)
library(data.table)

Helper functions:

expit <- function(x) exp(x)/(exp(x) + 1)
acc <- function(y, yhat) mean(y == yhat)

Question 1

Check students’ code for this one, as computers will behave differently.

Part a

readr

t0 <- Sys.time()
covid_tidy <- read_csv("../data/coronavirus.csv",
                       col_types = cols(Admin2 = col_character(),
                                        Date = col_date('%m/%d/%Y')))
## Warning: 1049272 parsing failures.
##   row          col           expected               actual                      file
## 30835 Combined_Key 1/0/T/F/TRUE/FALSE Baca, Colorado, US   '../data/coronavirus.csv'
## 30835 FIPS         1/0/T/F/TRUE/FALSE 08009                '../data/coronavirus.csv'
## 30836 Combined_Key 1/0/T/F/TRUE/FALSE Geary, Kansas, US    '../data/coronavirus.csv'
## 30836 FIPS         1/0/T/F/TRUE/FALSE 20061                '../data/coronavirus.csv'
## 30837 Combined_Key 1/0/T/F/TRUE/FALSE Lincoln, Wyoming, US '../data/coronavirus.csv'
## ..... ............ .................. .................... .........................
## See problems(...) for more details.
t1 <- Sys.time()
cat('readr read_csv time for covid:', t1-t0, '\n')
## readr read_csv time for covid: 4.368342

data.table

t2 <- Sys.time()
covid_dt <- fread("../data/coronavirus.csv")
covid_dt$Date <- as.Date(covid_dt$Date, format = '%m/%d/%Y')
t3 <- Sys.time()
cat('data.table fread time for covid:', t3-t2, '\n')
## data.table fread time for covid: 1.327819

data.table 3-4x faster!

Part b

dplyr

t4 <- Sys.time()
covid_tidy %>% arrange(Date) -> covid_tidy
t5 <- Sys.time()
cat('dplyr arrange time for covid:', t5-t4, '\n')
## dplyr arrange time for covid: 0.481689

data.table

t6 <- Sys.time()
covid_dt %>% setorder('Date')
t7 <- Sys.time()
cat('data.table setorder time for covid:', t7-t6, '\n')
## data.table setorder time for covid: 0.1377501

data.table 3-4x faster!

Part c

dplyr

t8 <- Sys.time()
covid_tidy %>%
  select(Date, Country_Region, Province_State, Admin2, Case_Type, Cases) %>%
  spread(Case_Type, Cases) -> covid_tidy
head(covid_tidy)
## # A tibble: 6 x 6
##   Date       Country_Region Province_State Admin2   Confirmed Deaths
##   <date>     <chr>          <chr>          <chr>        <dbl>  <dbl>
## 1 2020-01-22 US             Texas          Kimble           0      0
## 2 2020-01-22 US             Texas          Bowie            0      0
## 3 2020-01-22 US             Virginia       Carroll          0      0
## 4 2020-01-22 US             Maine          Franklin         0      0
## 5 2020-01-22 US             Illinois       Richland         0      0
## 6 2020-01-22 US             Utah           Tooele           0      0
dim(covid_tidy)
## [1] 287820      6
t9 <- Sys.time()
cat('dplyr spread time for covid:', t9-t8, '\n')
## dplyr spread time for covid: 1.79919

data.table

t10 <- Sys.time()
covid_dt %>% dcast(Country_Region+Province_State+Admin2+Date~Case_Type,
                   value.var = "Cases") -> covid_dt
head(covid_dt)
##    Country_Region Province_State Admin2       Date Confirmed Deaths
## 1:    Afghanistan            N/A        2020-01-22         0      0
## 2:    Afghanistan            N/A        2020-01-23         0      0
## 3:    Afghanistan            N/A        2020-01-24         0      0
## 4:    Afghanistan            N/A        2020-01-25         0      0
## 5:    Afghanistan            N/A        2020-01-26         0      0
## 6:    Afghanistan            N/A        2020-01-27         0      0
dim(covid_dt)
## [1] 287820      6
t11 <- Sys.time()
cat('data.table dcast time for covid:', t11-t10, '\n')
## data.table dcast time for covid: 0.510927

data.table 3x faster!

Question 2

quora <- read_csv('../data/quoraSmall.csv')
## Parsed with column specification:
## cols(
##   qid = col_character(),
##   question_text = col_character(),
##   target = col_double()
## )
n <- nrow(quora)

Part a

qs <- c('who', 'what', 'when', 'where', 'why', 'how')

percs <- sapply(qs, function(q) quora %>% filter(str_detect(question_text, str_c('(?i)', str_c('^', q)))) %>% nrow()/n)

otherPerc <- 1 - sum(percs)
qTypes <- c(percs, otherPerc)
qTypes
##     who    what    when   where     why     how         
## 0.01702 0.31405 0.01062 0.01392 0.10457 0.18963 0.35019

Part b

quora %>% 
  filter(str_detect(question_text, '^.*\\.{1} {1}')) %>%
  nrow()/n
## [1] 0.04553
quora %>% 
  filter(str_detect(question_text, '^.*\\.{1} {1}')) %>%
  summarize(trollPerc = mean(target))
## # A tibble: 1 x 1
##   trollPerc
##       <dbl>
## 1     0.108
mean(quora$target)
## [1] 0.06147

Somewhat more likely. Perhaps a statement in a question can be meant to bias readers?

Part c

quora %>% 
  mutate(vac = str_detect(question_text, '(?i)vaccin'),
         qq = str_detect(question_text, '\\?{2,}'),
         per = str_detect(question_text, '^.*\\.{1} {1}')) ->
  quora

trollMod <- glm(target ~ vac + qq + per, data = quora[1:90000,], family = 'binomial')
summary(trollMod)
## 
## Call:
## glm(formula = target ~ vac + qq + per, family = "binomial", data = quora[1:90000, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5407  -0.3506  -0.3506  -0.3506   2.3748  
## 
## Coefficients:
##             Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -2.75831    0.01441 -191.417   <2e-16 ***
## vacTRUE      0.25251    0.52313    0.483    0.629    
## qqTRUE      -7.80772   84.47666   -0.092    0.926    
## perTRUE      0.65703    0.05236   12.549   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41768  on 89999  degrees of freedom
## Residual deviance: 41631  on 89996  degrees of freedom
## AIC: 41639
## 
## Number of Fisher Scoring iterations: 9
f <- function(x) x > expit(coef(trollMod)[1])
preds <- predict(trollMod, quora[90001:n,], type = 'response')
acc(f(preds), quora[90001:n,'target'])
## [1] 0.903

We get 94% accuracy by just using a prediction of “not troll” every time (only 6% troll questions)–without using covariates or the questions themselves at all! Our prediction rule doesn’t do better in terms of accuracy because even though we may be identifying Troll questions at > 6% accuracy, we aren’t beating 94%! When the two outcome classes are this imbalanced, accuracy is usually unhelpful.