Collaborators: me, myself, and I
Required libraries:
library(tidyverse)
library(data.table)
Helper functions:
expit <- function(x) exp(x)/(exp(x) + 1)
acc <- function(y, yhat) mean(y == yhat)
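A quick sanity check of these helpers (made-up values, just to illustrate):
expit(0)                     # 0.5: the inverse logit maps 0 to one half
acc(c(1, 0, 1), c(1, 1, 1))  # 2/3 of the predictions match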
Note: timings will vary from machine to machine, so grade this part by checking students’ code rather than matching the numbers below.
readr
t0 <- Sys.time()
covid_tidy <- read_csv("../data/coronavirus.csv",
col_types = cols(Admin2 = col_character(),
Date = col_date('%m/%d/%Y')))
## Warning: 1049272 parsing failures.
## row col expected actual file
## 30835 Combined_Key 1/0/T/F/TRUE/FALSE Baca, Colorado, US '../data/coronavirus.csv'
## 30835 FIPS 1/0/T/F/TRUE/FALSE 08009 '../data/coronavirus.csv'
## 30836 Combined_Key 1/0/T/F/TRUE/FALSE Geary, Kansas, US '../data/coronavirus.csv'
## 30836 FIPS 1/0/T/F/TRUE/FALSE 20061 '../data/coronavirus.csv'
## 30837 Combined_Key 1/0/T/F/TRUE/FALSE Lincoln, Wyoming, US '../data/coronavirus.csv'
## ..... ............ .................. .................... .........................
## See problems(...) for more details.
t1 <- Sys.time()
cat('readr read_csv time for covid:', t1-t0, '\n')
## readr read_csv time for covid: 4.368342
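The parsing failures arise because read_csv guesses column types from the early rows, where FIPS and Combined_Key look empty and get guessed as logical. A sketch that should silence the warning, assuming both columns are meant to be character:
covid_tidy <- read_csv("../data/coronavirus.csv",
                       col_types = cols(Admin2 = col_character(),
                                        FIPS = col_character(),
                                        Combined_Key = col_character(),
                                        Date = col_date('%m/%d/%Y')))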
data.table
t2 <- Sys.time()
covid_dt <- fread("../data/coronavirus.csv")
covid_dt$Date <- as.Date(covid_dt$Date, format = '%m/%d/%Y')
t3 <- Sys.time()
cat('data.table fread time for covid:', t3-t2, '\n')
## data.table fread time for covid: 1.327819
data.table 3-4x faster!
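These one-off Sys.time() gaps are noisy. For a steadier comparison we could, for example, repeat each read and average the elapsed times (a sketch, same file as above):
elapsed <- replicate(5, system.time(fread("../data/coronavirus.csv"))['elapsed'])
mean(elapsed)  # average elapsed seconds over 5 runs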
dplyr
t4 <- Sys.time()
covid_tidy %>% arrange(Date) -> covid_tidy
t5 <- Sys.time()
cat('dplyr arrange time for covid:', t5-t4, '\n')
## dplyr arrange time for covid: 0.481689
data.table
t6 <- Sys.time()
setorder(covid_dt, Date)
t7 <- Sys.time()
cat('data.table setorder time for covid:', t7-t6, '\n')
## data.table setorder time for covid: 0.1377501
data.table 3-4x faster!
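Part of the gap is that setorder() sorts the table in place, by reference, while arrange() builds a sorted copy. A toy illustration (hypothetical data, not from the assignment):
dt <- data.table(x = c(3, 1, 2))
setorder(dt, x)  # no reassignment needed: dt itself is now sorted
dt$x             # 1 2 3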
dplyr
t8 <- Sys.time()
covid_tidy %>%
select(Date, Country_Region, Province_State, Admin2, Case_Type, Cases) %>%
spread(Case_Type, Cases) -> covid_tidy
head(covid_tidy)
## # A tibble: 6 x 6
## Date Country_Region Province_State Admin2 Confirmed Deaths
## <date> <chr> <chr> <chr> <dbl> <dbl>
## 1 2020-01-22 US Texas Kimble 0 0
## 2 2020-01-22 US Texas Bowie 0 0
## 3 2020-01-22 US Virginia Carroll 0 0
## 4 2020-01-22 US Maine Franklin 0 0
## 5 2020-01-22 US Illinois Richland 0 0
## 6 2020-01-22 US Utah Tooele 0 0
dim(covid_tidy)
## [1] 287820 6
t9 <- Sys.time()
cat('dplyr spread time for covid:', t9-t8, '\n')
## dplyr spread time for covid: 1.79919
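Aside: spread() still works but is superseded in current tidyr; the same reshape of the long-format covid_tidy (i.e., before the reassignment above) could be written with its replacement, pivot_wider():
covid_tidy %>%
  select(Date, Country_Region, Province_State, Admin2, Case_Type, Cases) %>%
  pivot_wider(names_from = Case_Type, values_from = Cases)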
data.table
t10 <- Sys.time()
covid_dt %>% dcast(Country_Region+Province_State+Admin2+Date~Case_Type,
value.var = "Cases") -> covid_dt
head(covid_dt)
##    Country_Region Province_State Admin2       Date Confirmed Deaths
## 1:    Afghanistan            N/A        2020-01-22         0      0
## 2:    Afghanistan            N/A        2020-01-23         0      0
## 3:    Afghanistan            N/A        2020-01-24         0      0
## 4:    Afghanistan            N/A        2020-01-25         0      0
## 5:    Afghanistan            N/A        2020-01-26         0      0
## 6:    Afghanistan            N/A        2020-01-27         0      0
dim(covid_dt)
## [1] 287820 6
t11 <- Sys.time()
cat('data.table dcast time for covid:', t11-t10, '\n')
## data.table dcast time for covid: 0.510927
data.table 3x faster!
quora <- read_csv('../data/quoraSmall.csv')
## Parsed with column specification:
## cols(
## qid = col_character(),
## question_text = col_character(),
## target = col_double()
## )
n <- nrow(quora)
qs <- c('who', 'what', 'when', 'where', 'why', 'how')
percs <- sapply(qs, function(q) {
  quora %>%
    filter(str_detect(question_text, str_c('(?i)^', q))) %>%
    nrow() / n
})
otherPerc <- 1 - sum(percs)
qTypes <- c(percs, other = otherPerc)
qTypes
##     who    what    when   where     why     how   other
## 0.01702 0.31405 0.01062 0.01392 0.10457 0.18963 0.35019
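As a sanity check on the pattern (made-up strings): (?i) makes the match case-insensitive and ^ anchors it at the start of the question, so a question word in mid-sentence does not count.
str_detect(c('Why is the sky blue?', 'Tell me why'), '(?i)^why')
## [1]  TRUE FALSE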
quora %>%
  filter(str_detect(question_text, '^.*\\. ')) %>%  # contains a period followed by a space
  nrow() / n
## [1] 0.04553
quora %>%
  filter(str_detect(question_text, '^.*\\. ')) %>%
  summarize(trollPerc = mean(target))
## # A tibble: 1 x 1
## trollPerc
## <dbl>
## 1 0.108
mean(quora$target)
## [1] 0.06147
Troll questions are somewhat more likely among questions containing a statement: 10.8% vs. the 6.1% overall rate. Perhaps a statement preceding the question is meant to bias readers?
quora %>%
  mutate(vac = str_detect(question_text, '(?i)vaccin'),  # mentions vaccines
         qq  = str_detect(question_text, '\\?{2,}'),     # two or more question marks in a row
         per = str_detect(question_text, '^.*\\. ')) ->  # contains a period followed by a space
  quora
trollMod <- glm(target ~ vac + qq + per, data = quora[1:90000,], family = 'binomial')
summary(trollMod)
##
## Call:
## glm(formula = target ~ vac + qq + per, family = "binomial", data = quora[1:90000,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5407 -0.3506 -0.3506 -0.3506 2.3748
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.75831 0.01441 -191.417 <2e-16 ***
## vacTRUE 0.25251 0.52313 0.483 0.629
## qqTRUE -7.80772 84.47666 -0.092 0.926
## perTRUE 0.65703 0.05236 12.549 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41768 on 89999 degrees of freedom
## Residual deviance: 41631 on 89996 degrees of freedom
## AIC: 41639
##
## Number of Fisher Scoring iterations: 9
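The rule below flags a question as a troll whenever its predicted probability exceeds the model’s baseline, i.e., the fitted probability for a question with none of the three flags. That cutoff is just the expit of the intercept:
expit(coef(trollMod)[1])  # predicted troll probability when vac = qq = per are all FALSE, about 0.06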
f <- function(x) x > expit(coef(trollMod)[1])  # threshold at the no-flags baseline probability
preds <- predict(trollMod, quora[90001:n,], type = 'response')
acc(f(preds), quora$target[90001:n])
## [1] 0.903
Predicting “not troll” for every single question already achieves about 94% accuracy (only about 6% of questions are trolls), without using the covariates or the question text at all. Our rule manages 90.3%, so even if it identifies troll questions at better than the 6% base rate, it loses to that trivial baseline on raw accuracy. When the two outcome classes are this imbalanced, accuracy is usually an unhelpful metric.
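When accuracy is this uninformative, class-aware metrics say more. A sketch, reusing the same test split as above, of the trivial baseline alongside recall and precision for the troll class:
test_y <- quora$target[90001:n]
pred_y <- f(preds)
mean(test_y == 0)                             # the always-“not troll” baseline, about 0.94
sum(pred_y & test_y == 1) / sum(test_y == 1)  # recall: share of true trolls we flag
sum(pred_y & test_y == 1) / sum(pred_y)       # precision: share of our flags that are trolls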