```{r setup, include=FALSE}
# Global knitr options: echo code in the rendered output and cache chunk results.
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```
```{r, warning=FALSE, message=FALSE, echo = FALSE}
# Data manipulation verbs (mutate, count, slice, ...)
library(dplyr)
# Plotting
library(ggplot2)
# Arranging multiple ggplots into one figure (grid.arrange)
library(gridExtra)
# Model training utilities (train, createDataPartition, ...)
library(caret)
```
```{r, echo = FALSE}
# Collapse the 7-level party identification (PID) in faraway::nes96 into a
# 3-level party variable; leaners ("ind*") are grouped with their party.
# case_when replaces the original nested ifelse with the idiomatic dplyr form;
# PID values not matched by either set (pure independents) become "Ind".
nes96 <- faraway::nes96 %>%
  mutate(
    party = case_when(
      PID %in% c("strRep", "weakRep", "indRep") ~ "Rep",
      PID %in% c("strDem", "weakDem", "indDem") ~ "Dem",
      TRUE ~ "Ind"
    )
  )
```
## Classification
Our response variable is a category for each observation, not a number.
## Example: can we predict party affiliation as a function of age?
We have data including the following variables for each of 944 participants in the 1996 American National Election Study:
* party affiliation ("Dem", "Ind", or "Rep"). This is our response variable, $y_i$
* age (range 19 to 91). This is our explanatory variable or feature, $x_i$
```{r}
# Peek at the two variables we'll use, and the total sample size.
head(select(nes96, age, party))
nrow(nes96)
```
Here are the counts of how many participants are in each party:
```{r}
# Number of participants in each party.
count(nes96, party)
```
#### Train/test split
As with regression, we will evaluate model performance based on a test set.
```{r}
set.seed(88412)
# Stratified 80/20 split: createDataPartition samples within each party.
split_idx <- caret::createDataPartition(nes96$party, p = 0.8)[[1]]
train_nes96 <- nes96 %>% slice(split_idx)
test_nes96 <- nes96 %>% slice(-split_idx)
```
\newpage
#### Some plots
A scatter plot isn't that useful:
```{r, fig.height = 3.5}
# Raw scatter plot: party is categorical, so the points overlap heavily.
scatter_plot <- ggplot(train_nes96, aes(x = age, y = party)) +
  geom_point()
scatter_plot
```
We can jitter the points, but still not that helpful:
```{r, fig.height = 3.5}
# Vertical jitter separates the points a little, but the pattern is still
# hard to read.
jittered_plot <- ggplot(train_nes96,
                        aes(x = age, y = party, color = party)) +
  geom_point(position = position_jitter(height = 0.1)) +
  scale_color_manual(values = c("orange", "cornflowerblue", "mediumblue"))
jittered_plot
```
\newpage
How about a histogram? `position = "fill"` says that within each bin, we want the bars to add up to 100%.
```{r, fig.height = 3.5}
# "Filled" histogram: within each age bin, the bar segments show the
# proportion of participants in each party (segments sum to 100%).
filled_hist <- ggplot(train_nes96, aes(x = age, fill = party)) +
  geom_histogram(position = "fill") +
  scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue"))
filled_hist
```
All of these put our response on the vertical axis, which is the easiest way to think about the model. We could also do something like a density plot of the explanatory variable colored by the response:
```{r, fig.height = 3.5}
# Density of the explanatory variable (age) within each party.
age_density <- ggplot(train_nes96, aes(x = age, color = party)) +
  geom_density() +
  scale_color_manual(values = c("orange", "cornflowerblue", "mediumblue"))
age_density
```
\newpage
### R Code for K Nearest Neighbors for Classification
```{r}
# "Train" the KNN classifier -- the caret call is exactly the same as for
# KNN regression.
knn_fit <- train(
  form = party ~ age,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 100)
)

# Estimated class membership probabilities: specify type = "prob".
f_hats <- predict(knn_fit, newdata = test_nes96, type = "prob")
head(f_hats)

# Most likely class: type = "raw" (the default, so type could be left out).
# If the estimated class probability is the same for two classes, ties are
# broken at random.
y_hats <- predict(knn_fit, newdata = test_nes96, type = "raw")
head(y_hats)

# Classification error rate: the proportion of predicted parties that are
# not equal to the observed party.
mean(y_hats != test_nes96$party)

# Baseline for comparison: always predict the most common training-set class.
count(train_nes96, party)
mean(test_nes96$party != "Dem")
```
Our model does slightly better than just guessing the most common party in the training set.
\newpage
Here's a way to plot class membership probabilities as functions of age. It's admittedly a little awkward.
```{r, fig.height = 3.5}
# Estimated probability of membership in `party` at age(s) x, from the
# fitted KNN model. A little awkward, but it works with stat_function.
predict_knn_probs <- function(x, party) {
  class_probs <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
  class_probs[[party]]
}

# One stat_function layer per class; the literal strings inside aes() label
# the legend entries.
dem_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Dem"),
                           mapping = aes(color = "Dem"))
ind_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Ind"),
                           mapping = aes(color = "Ind"))
rep_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Rep"),
                           mapping = aes(color = "Rep"))

ggplot(nes96, aes(x = age)) +
  dem_layer +
  ind_layer +
  rep_layer +
  scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
  ylim(0, 1)
```
\newpage
### Flexibility is determined by $k$
Here are plots of the estimated class probability functions for several values of $k$ (code suppressed):
```{r, echo = FALSE}
# Fit a KNN classifier with the given k (same caret call as KNN regression)
# and return a ggplot of the estimated class membership probabilities as
# functions of age. Test-set predictions are not needed for this plot, so
# none are computed here.
get_knn_plot <- function(k) {
  knn_fit <- train(
    form = party ~ age,
    data = train_nes96,
    method = "knn",
    preProcess = "scale",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(k = k)
  )

  # Estimated probability of membership in `party` at age(s) x.
  # Admittedly a little awkward, but it works with stat_function.
  predict_knn_probs <- function(x, party) {
    f_hats <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
    f_hats[[party]]
  }

  ggplot(data = nes96, mapping = aes(x = age)) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Dem"),
      mapping = aes(color = "Dem")) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Ind"),
      mapping = aes(color = "Ind")) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Rep"),
      mapping = aes(color = "Rep")) +
    scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
    ylim(0, 1) +
    ggtitle(paste0("k = ", k))
}

# One panel per value of k; flexibility decreases as k grows.
grid.arrange(
  get_knn_plot(1),
  get_knn_plot(5),
  get_knn_plot(10),
  get_knn_plot(50),
  get_knn_plot(100),
  get_knn_plot(250)
)
```
### Decision Boundaries
We won't explicitly calculate this for KNN, but it's nice to have in mind the concept of a *decision boundary*: the point at which the predicted value (class with highest estimated probability) changes. I've indicated the decision boundaries on the plot below for $k = 250$:
```{r, echo = FALSE, fig.height = 3}
# Fit KNN with k = 250 and mark the decision boundaries: ages at which the
# most likely class (highest estimated probability) changes.
knn_fit <- train(
  form = party ~ age,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 250)
)

# Fine grid of ages spanning the observed range (19 to 91); defined once and
# reused for the predictions, the change points, and the vertical lines.
age_grid <- seq(from = 19, to = 91, length.out = 1001)

# Estimated class membership probabilities at each grid point.
f_hats <- predict(knn_fit, newdata = data.frame(age = age_grid), type = "prob")

# Estimated probability of membership in `party` at age(s) x; used by
# stat_function below.
predict_knn_probs <- function(x, party) {
  class_probs <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
  class_probs[[party]]
}

# Index of the most likely class at each grid point (ties.method = "first"
# matches which.max), then run-length encode to find the grid indices where
# the predicted class changes. Drop the last cumulative length -- it is the
# end of the grid, not a change point.
most_likely_class <- max.col(as.matrix(f_hats), ties.method = "first")
changepoint_inds <- head(cumsum(rle(most_likely_class)$lengths), -1)

ggplot(data = nes96, mapping = aes(x = age)) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Dem"),
    mapping = aes(color = "Dem"),
    n = 1001) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Ind"),
    mapping = aes(color = "Ind"),
    n = 1001) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Rep"),
    mapping = aes(color = "Rep"),
    n = 1001) +
  geom_vline(xintercept = age_grid[changepoint_inds]) +
  scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
  ylim(0, 1)
```
Note that there are generally fewer decision boundaries as $k$ increases.
\newpage
### KNN with 2 features
Suppose we use two variables to predict party affiliation:
* `age` (range 19 to 91). This is our first explanatory variable or feature, $x_{i1}$
* `popul` (range 0 to 7300) population of respondent's location in 1000s of people. This is our second feature, $x_{i2}$
With 2 inputs, the estimated class probability functions would have to be visualized in 3 dimensions (age, popul, and estimated class probability).
Instead, it's easier to display the decision boundaries in the two-dimensional feature space of values of (age, popul).
The plots below show these for a range of values of $k$:
```{r, echo = FALSE, fig.height = 8}
# Fit a two-feature (age, popul) KNN classifier with the given k and return
# a ggplot showing the predicted class over a grid of feature values
# (shaded background) with the training data overlaid.
get_knn_plot <- function(k) {
  knn_fit <- train(
    form = party ~ age + popul,
    data = train_nes96,
    method = "knn",
    preProcess = "scale",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(k = k)
  )

  # Grid covering the stated feature ranges: age 19 to 91, popul 0 to 7300.
  # (popul starts at 0, not 19; 19 appears to have been copied over from
  # age's minimum, which left the bottom of the popul range unshaded.)
  test_grid <- expand.grid(
    age = seq(from = 19, to = 91, length.out = 201),
    popul = seq(from = 0, to = 7300, length.out = 201)
  )

  # Estimated most likely class at each grid point.
  y_hats <- predict(knn_fit, newdata = test_grid, type = "raw")
  background_knn <- test_grid %>%
    mutate(
      est_party = y_hats
    )

  # geom_raster shades the background by predicted class; alpha = 0.2 keeps
  # it transparent so the training points remain visible.
  ggplot() +
    geom_raster(data = background_knn,
      mapping = aes(x = age, y = popul, fill = est_party), alpha = 0.2) +
    geom_point(data = train_nes96, mapping = aes(x = age, y = popul, color = party)) +
    scale_color_manual("party", values = c("orange", "cornflowerblue", "mediumblue")) +
    scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue")) +
    ggtitle(paste0("KNN, k = ", k))
}

plots_knn <- lapply(c(1, 5, 10, 50, 100, 250), get_knn_plot)
grid.arrange(plots_knn[[1]], plots_knn[[2]], plots_knn[[3]],
             plots_knn[[4]], plots_knn[[5]], plots_knn[[6]], ncol = 2)
```
Here's how you could make one of these plots:
```{r, fig.height = 2}
# "Train" the two-feature KNN model.
knn_fit <- train(
  form = party ~ age + popul,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 5)
)

# A grid of values for age and popul at which to get the estimated class.
# It's not a test data set in the sense that we don't have observations of
# party to go with these points, but we will treat it as a "test set" in the
# sense that we will obtain predictions at these points. The grid covers the
# stated feature ranges: age 19 to 91, popul 0 to 7300 (popul starts at 0,
# not 19; 19 appears to have been copied over from age's minimum).
test_grid <- expand.grid(
  age = seq(from = 19, to = 91, length.out = 201),
  popul = seq(from = 0, to = 7300, length.out = 201)
)
head(test_grid)

# Use predict to find the estimated most likely class at each grid point.
y_hats <- predict(knn_fit, newdata = test_grid, type = "raw")

# Add the estimated classes into the test_grid data frame.
background_knn <- test_grid %>%
  mutate(
    est_party = y_hats
  )

# Make the plot: geom_raster does the shading in the background, and
# alpha = 0.2 makes it transparent.
ggplot() +
  geom_raster(data = background_knn,
    mapping = aes(x = age, y = popul, fill = est_party), alpha = 0.2) +
  geom_point(data = train_nes96, mapping = aes(x = age, y = popul, color = party)) +
  scale_color_manual("party", values = c("orange", "cornflowerblue", "mediumblue")) +
  scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue")) +
  ggtitle("KNN, k = 5")
```