```{r setup, include=FALSE}
# Global knitr options: echo code in the rendered output and cache chunk results.
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
```
```{r, warning=FALSE, message=FALSE, echo = FALSE}
# Data manipulation verbs (mutate, count, slice, ...)
library(dplyr)
# Plotting
library(ggplot2)
# Arranging multiple ggplots into one figure (grid.arrange)
library(gridExtra)
# Model training utilities (train, createDataPartition, ...)
library(caret)
```
```{r, echo = FALSE}
# Collapse the 7-level party identification (PID) in faraway::nes96 into a
# 3-level party variable; leaners ("ind*") are grouped with their party.
# case_when replaces the original nested ifelse with the idiomatic dplyr form;
# PID values not matched by either set (pure independents) become "Ind".
nes96 <- faraway::nes96 %>%
  mutate(
    party = case_when(
      PID %in% c("strRep", "weakRep", "indRep") ~ "Rep",
      PID %in% c("strDem", "weakDem", "indDem") ~ "Dem",
      TRUE ~ "Ind"
    )
  )
```
## Classification
Our response variable is a category for each observation, not a number.
## Example: can we predict party affiliation as a function of age?
We have data including the following variables for each of 944 participants in the 1996 American National Election Study:
* party affiliation ("Dem", "Ind", or "Rep"). This is our response variable, $y_i$
* age (range 19 to 91). This is our explanatory variable or feature, $x_i$
```{r}
# Peek at the two variables we'll use, and the total sample size.
head(select(nes96, age, party))
nrow(nes96)
```
Here are the counts of how many participants are in each party:
```{r}
# Number of participants in each party.
count(nes96, party)
```
#### Train/test split
As with regression, we will evaluate model performance based on a test set.
```{r}
set.seed(88412)
# Stratified 80/20 split: createDataPartition samples within each party.
split_idx <- caret::createDataPartition(nes96$party, p = 0.8)[[1]]
train_nes96 <- nes96 %>% slice(split_idx)
test_nes96 <- nes96 %>% slice(-split_idx)
```
\newpage
#### Some plots
A scatter plot isn't that useful:
```{r, fig.height = 3.5}
# Raw scatter plot: party is categorical, so the points overlap heavily.
scatter_plot <- ggplot(train_nes96, aes(x = age, y = party)) +
  geom_point()
scatter_plot
```
We can jitter the points, but still not that helpful:
```{r, fig.height = 3.5}
# Vertical jitter separates the points a little, but the pattern is still
# hard to read.
jittered_plot <- ggplot(train_nes96,
                        aes(x = age, y = party, color = party)) +
  geom_point(position = position_jitter(height = 0.1)) +
  scale_color_manual(values = c("orange", "cornflowerblue", "mediumblue"))
jittered_plot
```
\newpage
How about a histogram? `position = "fill"` says that within each bin, we want the bars to add up to 100%.
```{r, fig.height = 3.5}
# "Filled" histogram: within each age bin, the bar segments show the
# proportion of participants in each party (segments sum to 100%).
filled_hist <- ggplot(train_nes96, aes(x = age, fill = party)) +
  geom_histogram(position = "fill") +
  scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue"))
filled_hist
```
All of these put our response on the vertical axis, which is the easiest way to think about the model. We could also do something like a density plot of the explanatory variable colored by the response:
```{r, fig.height = 3.5}
# Density of the explanatory variable (age) within each party.
age_density <- ggplot(train_nes96, aes(x = age, color = party)) +
  geom_density() +
  scale_color_manual(values = c("orange", "cornflowerblue", "mediumblue"))
age_density
```
\newpage
### R Code for K Nearest Neighbors for Classification
```{r}
# "Train" the KNN classifier -- the caret call is exactly the same as for
# KNN regression.
knn_fit <- train(
  form = party ~ age,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 100)
)

# Estimated class membership probabilities: specify type = "prob".
f_hats <- predict(knn_fit, newdata = test_nes96, type = "prob")
head(f_hats)

# Most likely class: type = "raw" (the default, so type could be left out).
# If the estimated class probability is the same for two classes, ties are
# broken at random.
y_hats <- predict(knn_fit, newdata = test_nes96, type = "raw")
head(y_hats)

# Classification error rate: the proportion of predicted parties that are
# not equal to the observed party.
mean(y_hats != test_nes96$party)

# Baseline for comparison: always predict the most common training-set class.
count(train_nes96, party)
mean(test_nes96$party != "Dem")
```
Our model does slightly better than just guessing the most common party in the training set.
\newpage
Here's a way to plot class membership probabilities as functions of age. It's admittedly a little awkward.
```{r, fig.height = 3.5}
# Estimated probability of membership in `party` at age(s) x, from the
# fitted KNN model. A little awkward, but it works with stat_function.
predict_knn_probs <- function(x, party) {
  class_probs <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
  class_probs[[party]]
}

# One stat_function layer per class; the literal strings inside aes() label
# the legend entries.
dem_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Dem"),
                           mapping = aes(color = "Dem"))
ind_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Ind"),
                           mapping = aes(color = "Ind"))
rep_layer <- stat_function(fun = predict_knn_probs,
                           args = list(party = "Rep"),
                           mapping = aes(color = "Rep"))

ggplot(nes96, aes(x = age)) +
  dem_layer +
  ind_layer +
  rep_layer +
  scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
  ylim(0, 1)
```
\newpage
### Flexibility is determined by $k$
Here are plots of the estimated class probability functions for several values of $k$ (code suppressed):
```{r, echo = FALSE}
# Fit a KNN classifier with the given k (same caret call as KNN regression)
# and return a ggplot of the estimated class membership probabilities as
# functions of age. Test-set predictions are not needed for this plot, so
# none are computed here.
get_knn_plot <- function(k) {
  knn_fit <- train(
    form = party ~ age,
    data = train_nes96,
    method = "knn",
    preProcess = "scale",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(k = k)
  )

  # Estimated probability of membership in `party` at age(s) x.
  # Admittedly a little awkward, but it works with stat_function.
  predict_knn_probs <- function(x, party) {
    f_hats <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
    f_hats[[party]]
  }

  ggplot(data = nes96, mapping = aes(x = age)) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Dem"),
      mapping = aes(color = "Dem")) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Ind"),
      mapping = aes(color = "Ind")) +
    stat_function(fun = predict_knn_probs,
      args = list(party = "Rep"),
      mapping = aes(color = "Rep")) +
    scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
    ylim(0, 1) +
    ggtitle(paste0("k = ", k))
}

# One panel per value of k; flexibility decreases as k grows.
grid.arrange(
  get_knn_plot(1),
  get_knn_plot(5),
  get_knn_plot(10),
  get_knn_plot(50),
  get_knn_plot(100),
  get_knn_plot(250)
)
```
### Decision Boundaries
We won't explicitly calculate this for KNN, but it's nice to have in mind the concept of a *decision boundary*: the point at which the predicted value (class with highest estimated probability) changes. I've indicated the decision boundaries on the plot below for $k = 250$:
```{r, echo = FALSE, fig.height = 3}
# Fit KNN with k = 250 and mark the decision boundaries: ages at which the
# most likely class (highest estimated probability) changes.
knn_fit <- train(
  form = party ~ age,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 250)
)

# Fine grid of ages spanning the observed range (19 to 91); defined once and
# reused for the predictions, the change points, and the vertical lines.
age_grid <- seq(from = 19, to = 91, length.out = 1001)

# Estimated class membership probabilities at each grid point.
f_hats <- predict(knn_fit, newdata = data.frame(age = age_grid), type = "prob")

# Estimated probability of membership in `party` at age(s) x; used by
# stat_function below.
predict_knn_probs <- function(x, party) {
  class_probs <- predict(knn_fit, newdata = data.frame(age = x), type = "prob")
  class_probs[[party]]
}

# Index of the most likely class at each grid point (ties.method = "first"
# matches which.max), then run-length encode to find the grid indices where
# the predicted class changes. Drop the last cumulative length -- it is the
# end of the grid, not a change point.
most_likely_class <- max.col(as.matrix(f_hats), ties.method = "first")
changepoint_inds <- head(cumsum(rle(most_likely_class)$lengths), -1)

ggplot(data = nes96, mapping = aes(x = age)) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Dem"),
    mapping = aes(color = "Dem"),
    n = 1001) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Ind"),
    mapping = aes(color = "Ind"),
    n = 1001) +
  stat_function(fun = predict_knn_probs,
    args = list(party = "Rep"),
    mapping = aes(color = "Rep"),
    n = 1001) +
  geom_vline(xintercept = age_grid[changepoint_inds]) +
  scale_color_manual("Party", values = c("orange", "cornflowerblue", "mediumblue")) +
  ylim(0, 1)
```
Note that there are generally fewer decision boundaries as $k$ increases.
\newpage
### KNN with 2 features
Suppose we use two variables to predict party affiliation:
* `age` (range 19 to 91). This is our first explanatory variable or feature, $x_{i1}$
* `popul` (range 0 to 7300) population of respondent's location in 1000s of people. This is our second feature, $x_{i2}$
With 2 inputs, the estimated class probability functions would have to be visualized in 3 dimensions (age, popul, and estimated class probability).
Instead, it's easier to display the decision boundaries in the two-dimensional feature space of values of (age, popul).
The plots below show these for a range of values of $k$:
```{r, echo = FALSE, fig.height = 8}
# Fit a two-feature (age, popul) KNN classifier with the given k and return
# a ggplot showing the predicted class over a grid of feature values
# (shaded background) with the training data overlaid.
get_knn_plot <- function(k) {
  knn_fit <- train(
    form = party ~ age + popul,
    data = train_nes96,
    method = "knn",
    preProcess = "scale",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(k = k)
  )

  # Grid covering the stated feature ranges: age 19 to 91, popul 0 to 7300.
  # (popul starts at 0, not 19; 19 appears to have been copied over from
  # age's minimum, which left the bottom of the popul range unshaded.)
  test_grid <- expand.grid(
    age = seq(from = 19, to = 91, length.out = 201),
    popul = seq(from = 0, to = 7300, length.out = 201)
  )

  # Estimated most likely class at each grid point.
  y_hats <- predict(knn_fit, newdata = test_grid, type = "raw")
  background_knn <- test_grid %>%
    mutate(
      est_party = y_hats
    )

  # geom_raster shades the background by predicted class; alpha = 0.2 keeps
  # it transparent so the training points remain visible.
  ggplot() +
    geom_raster(data = background_knn,
      mapping = aes(x = age, y = popul, fill = est_party), alpha = 0.2) +
    geom_point(data = train_nes96, mapping = aes(x = age, y = popul, color = party)) +
    scale_color_manual("party", values = c("orange", "cornflowerblue", "mediumblue")) +
    scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue")) +
    ggtitle(paste0("KNN, k = ", k))
}

plots_knn <- lapply(c(1, 5, 10, 50, 100, 250), get_knn_plot)
grid.arrange(plots_knn[[1]], plots_knn[[2]], plots_knn[[3]],
             plots_knn[[4]], plots_knn[[5]], plots_knn[[6]], ncol = 2)
```
Here's how you could make one of these plots:
```{r, fig.height = 2}
# "Train" the two-feature KNN model.
knn_fit <- train(
  form = party ~ age + popul,
  data = train_nes96,
  method = "knn",
  preProcess = "scale",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = 5)
)

# A grid of values for age and popul at which to get the estimated class.
# It's not a test data set in the sense that we don't have observations of
# party to go with these points, but we will treat it as a "test set" in the
# sense that we will obtain predictions at these points. The grid covers the
# stated feature ranges: age 19 to 91, popul 0 to 7300 (popul starts at 0,
# not 19; 19 appears to have been copied over from age's minimum).
test_grid <- expand.grid(
  age = seq(from = 19, to = 91, length.out = 201),
  popul = seq(from = 0, to = 7300, length.out = 201)
)
head(test_grid)

# Use predict to find the estimated most likely class at each grid point.
y_hats <- predict(knn_fit, newdata = test_grid, type = "raw")

# Add the estimated classes into the test_grid data frame.
background_knn <- test_grid %>%
  mutate(
    est_party = y_hats
  )

# Make the plot: geom_raster does the shading in the background, and
# alpha = 0.2 makes it transparent.
ggplot() +
  geom_raster(data = background_knn,
    mapping = aes(x = age, y = popul, fill = est_party), alpha = 0.2) +
  geom_point(data = train_nes96, mapping = aes(x = age, y = popul, color = party)) +
  scale_color_manual("party", values = c("orange", "cornflowerblue", "mediumblue")) +
  scale_fill_manual(values = c("orange", "cornflowerblue", "mediumblue")) +
  ggtitle("KNN, k = 5")
```