Collaborators: moi, yo, myself
Required libraries:
library(tidyverse)
Read the SAT data:
sat <- read.csv('../data/sat.csv')
colnames(sat)[1] <- 'sex'
colnames(sat)
## [1] "sex" "SATV" "SATM" "SATSum" "HSGPA" "FYGPA"
sat$sex <- sat$sex - 1 # 0/1 encoding
…and COVID-19 data:
covid <- read.csv("../data/coronavirus.csv")
colnames(covid)[1] <- 'Case_Type'
colnames(covid)
## [1] "Case_Type" "Cases" "Difference"
## [4] "Date" "Country_Region" "Province_State"
## [7] "Admin2" "Combined_Key" "FIPS"
## [10] "Lat" "Long" "Table_Names"
## [13] "Prep_Flow_Runtime"
covid$Date <- as.Date(covid$Date, format = '%m/%d/%Y')
…and voting data:
votes115 <- read_csv('../data/S115_votes.csv')
## Parsed with column specification:
## cols(
## congress = col_double(),
## chamber = col_character(),
## rollnumber = col_double(),
## icpsr = col_double(),
## cast_code = col_double(),
## prob = col_double()
## )
## Warning: 9 parsing failures.
## row col expected actual file
## 19043 prob a double N/A '../data/S115_votes.csv'
## 19056 prob a double N/A '../data/S115_votes.csv'
## 31946 prob a double N/A '../data/S115_votes.csv'
## 31964 prob a double N/A '../data/S115_votes.csv'
## 39749 prob a double N/A '../data/S115_votes.csv'
## ..... .... ........ ...... ........................
## See problems(...) for more details.
members115 <- read_csv('../data/S115_members.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## chamber = col_character(),
## state_abbrev = col_character(),
## bioname = col_character(),
## bioguide_id = col_character(),
## conditional = col_logical()
## )
## See spec(...) for full column specifications.
joint115 <- inner_join(votes115, members115,
by = 'icpsr') # link records on senator ID
joint115 %>% filter(!is.na(nominate_dim1), !is.na(nominate_dim2)) -> joint115
joint115 <- joint115 %>% filter(cast_code %in% c(1,6),
party_code %in% c(100,200))
joint115$party <- ifelse(joint115$party_code == 100, 'D', 'R')
joint115$yay <- (joint115$cast_code == 1)
Helper functions:
# e^x/(e^x+1) for input x
expit <- function(x) exp(x)/(exp(x) + 1)
# root mean squared error for true y and prediction yhat
rmse <- function(y,yhat) sqrt(mean((y-yhat)^2))
# mean absolute error for true y and prediction yhat
mae <- function(y,yhat) mean(abs(y-yhat))
# accuracy for predictions yhat and true binary outcome y
acc <- function(y, yhat) mean(y == yhat)
x <- rnorm(100)
x <- (x-mean(x))/sd(x)
out <- cbind(x, x+1, x+2)
apply(out, 1, sd)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
apply(out, 2, sd)
## x
## 1 1 1
write.csv(out, 'tricky.csv')
Setup
covid %>%
filter(Admin2 == 'San Francisco',
Case_Type == 'Confirmed',
Cases > 0,
Date < '2020-04-01',
Date >= '2020-03-01') %>%
select(Cases, Date) %>%
arrange(Date) ->
sf
covid %>%
filter(Admin2 == 'San Francisco',
Case_Type == 'Confirmed',
Cases > 0,
Date == '2020-04-01') %>%
select(Cases, Date) %>%
arrange(Date) ->
sfApr
Poisson regression
poisMod <- glm(Cases ~ Date, data = sf, family = 'poisson')
predict(poisMod, sfApr, type = 'response')
## 1
## 550.58
sfApr
## Cases Date
## 1 434 2020-04-01
Prediction’s much closer. This is because Poisson regression handles the nonlinearity of the log scale much better than linear regression with a log transform. Poisson is appropriate because it takes non-negative integer values, just like the number of COVID-19 cases does (among other reasons).
sum((1:1)^3)
## [1] 1
sum((1:2)^3)
## [1] 9
sum((1:3)^3)
## [1] 36
sum((1:4)^3)
## [1] 100
Aha, \(1^2,3^2,6^2,10^2\)! For full credit, you need something like this:
sumN3 <- function(n) choose(n+1,2)^2
sapply(1:4, sumN3)
## [1] 1 9 36 100
We’ll give [2pts] for this:
sumN3 <- function(n) sum(1:n)^2
sapply(1:4, sumN3)
## [1] 1 9 36 100
linMod <- lm(SATSum ~ HSGPA, data = sat)
inRange <- function(x) x <= 85 | x >= 115
sat$admit <- inRange(sat$SATSum)
logitMod <- glm(admit ~ poly(HSGPA,2), data = sat, family = 'binomial')
coef(linMod)
## (Intercept) HSGPA
## 66.98845 11.36317
coef(logitMod)
## (Intercept) poly(HSGPA, 2)1 poly(HSGPA, 2)2
## -0.7732423 7.7426233 11.5069453
An increase of 1 point in HS GPA is associated with an 11.4 point SAT score increase. No interpretation of logistic regression needed here.
nSAT <- dim(sat)[1]
randInd <- sample(nSAT)
train <- sat[randInd[1:400],]
validate <- sat[randInd[401:800],]
test <- sat[randInd[801:nSAT],]
linMods <- lapply(1:3, function(p) lm(SATSum ~ poly(HSGPA, p), data = sat))
lapply(linMods, function(mod) acc(validate$admit, inRange(predict(mod, validate))))
## [[1]]
## [1] 0.6875
##
## [[2]]
## [1] 0.6875
##
## [[3]]
## [1] 0.6875
lapply(linMods, function(mod) mean(inRange(predict(mod, validate))))
## [[1]]
## [1] 0
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 0
All equivalent accuracy, because they all predict “no admit” for most students
f1 <- function(x) x > 0.5
f2 <- function(x) x > 0.7
f3 <- function(x) x < 0.4
logitModTrain <- glm(admit ~ poly(HSGPA,2), data = train, family = 'binomial')
acc(validate$admit, f1(predict(logitModTrain, validate, type = 'response')))
## [1] 0.71
acc(validate$admit, f2(predict(logitModTrain, validate, type = 'response')))
## [1] 0.6875
acc(validate$admit, f3(predict(logitModTrain, validate, type = 'response')))
## [1] 0.31
The assignment wasn’t clear that you needed to train this model on the training data, so if students used the full data set for training we will award credit on this part only. \(f^1\) appears best, and is slightly better than the hacked linear models. Students will tend to get different answers due to randomness in train/test/validation split, so we should just check that the accuracy check is correctly coded and the written response makes sense.
acc(test$admit, f1(predict(logitModTrain, test, type = 'response')))
## [1] 0.685
acc(test$admit, inRange(predict(linMods[[1]], test)))
## [1] 0.69
Basically the same. We can’t measure predictive power on the same data we fit the model to, so we have to do a split. We actually need a 3-way split, because picking the best linear model or best \(f^i\) is another step of the model fitting and selection process, so we need a third set to evaluate the best of each type of model independently of the data used to train the models and select the best of each type.
dim(joint115)
## [1] 57124 29
stateMod <- glm(yay ~ state_abbrev, data = joint115, family = 'binomial')
coefs <- coef(stateMod)
senateProbsGLM <- expit(c(coefs[1], coefs[2:50]+coefs[1]))
senateProbsTab <- joint115 %>% group_by(state_abbrev) %>% summarize(yays = mean(yay))
senateProbsGLM
## (Intercept) state_abbrevAL state_abbrevAR state_abbrevAZ state_abbrevCA
## 0.8464193 0.8049209 0.8447987 0.7906977 0.4096802
## state_abbrevCO state_abbrevCT state_abbrevDE state_abbrevFL state_abbrevGA
## 0.7075630 0.4747899 0.5698372 0.7277580 0.8391111
## state_abbrevHI state_abbrevIA state_abbrevID state_abbrevIL state_abbrevIN
## 0.4351536 0.8412698 0.8392256 0.4610507 0.7731872
## state_abbrevKS state_abbrevKY state_abbrevLA state_abbrevMA state_abbrevMD
## 0.8454936 0.7780612 0.8470588 0.3419950 0.4877843
## state_abbrevME state_abbrevMI state_abbrevMN state_abbrevMO state_abbrevMS
## 0.8614357 0.4983137 0.5016949 0.7512864 0.8508239
## state_abbrevMT state_abbrevNC state_abbrevND state_abbrevNE state_abbrevNH
## 0.7168067 0.8517544 0.7810903 0.8341751 0.5472340
## state_abbrevNJ state_abbrevNM state_abbrevNV state_abbrevNY state_abbrevOH
## 0.3994465 0.4647766 0.6649396 0.3822297 0.6607893
## state_abbrevOK state_abbrevOR state_abbrevPA state_abbrevRI state_abbrevSC
## 0.8331922 0.3702451 0.6626712 0.4916107 0.8522040
## state_abbrevSD state_abbrevTN state_abbrevTX state_abbrevUT state_abbrevVA
## 0.8528175 0.8368237 0.8487973 0.8050633 0.5709459
## state_abbrevVT state_abbrevWA state_abbrevWI state_abbrevWV state_abbrevWY
## 0.5043029 0.4685908 0.6610597 0.7952822 0.8260870
senateProbsTab
## # A tibble: 50 x 2
## state_abbrev yays
## <chr> <dbl>
## 1 AK 0.846
## 2 AL 0.805
## 3 AR 0.845
## 4 AZ 0.791
## 5 CA 0.410
## 6 CO 0.708
## 7 CT 0.475
## 8 DE 0.570
## 9 FL 0.728
## 10 GA 0.839
## # ... with 40 more rows
nomMod <- glm(yay ~ nominate_dim1 + nominate_dim2,
data = joint115, family = 'binomial')
coef(nomMod)
## (Intercept) nominate_dim1 nominate_dim2
## 0.7165597 1.8357546 0.5905301
expit(c(1,-0.5,0)%*%coef(nomMod))
## [,1]
## [1,] 0.4498399
1 point increase in NOMINATE dimension 1 (strong swing rightwards in ideology) is associated with a 1.8 point increase in log-odds of voting “yay.” Taking the log of the odds is the same as taking the logit of p! Center-left senator’s estimated “yay” probability is 0.45.