# # Install required packages (NB: they may already be installed on your machine)
# install.packages("tidyverse")
# install.packages("infer")
# install.packages("janitor")
# install.packages("repr")
# install.packages("stringr")
# install.packages("waffle", repos = "https://cinc.rud.is")
# install.packages("plyr")
# install.packages("ggthemes")
# install.packages("knitr")
# install.packages("kableExtra")
# install.packages("RColorBrewer")
# # Waffle is sometimes finicky, so you might also need this to run this notebook locally
# remotes::install_github("hrbrmstr/waffle")
# Load required packages
library(plyr)
library(tidyverse)
library(infer)
library(janitor)
library(repr)
library(stringr)
library(waffle)
library(ggthemes)
library(knitr)
# library(kableExtra)
library(RColorBrewer)
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.1.0     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::arrange()   masks plyr::arrange()
x purrr::compact()   masks plyr::compact()
x dplyr::count()     masks plyr::count()
x dplyr::failwith()  masks plyr::failwith()
x dplyr::filter()    masks stats::filter()
x dplyr::id()        masks plyr::id()
x dplyr::lag()       masks stats::lag()
x dplyr::mutate()    masks plyr::mutate()
x dplyr::rename()    masks plyr::rename()
x dplyr::summarise() masks plyr::summarise()
x dplyr::summarize() masks plyr::summarize()

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
# Custom ggplot theme for use throughout the analysis
# Idea from [Joey Stanley](https://joeystanley.com/blog/custom-themes-in-ggplot2)
theme_stat201 <- function (width = 12, height = 5) {
    options(repr.plot.width = width, repr.plot.height = height)
    theme_bw(base_size = 14) %+replace%
        theme(
            plot.title = element_text(hjust = 0.5)
        )
}
# A helper function to generate figure numbers
create_caption <- function (figure_number, title) {
    print(paste(sprintf("Figure #%d:", figure_number), title))
}
create_caption(0, "Hello, world!")
[1] "Figure #0: Hello, world!"
# The seed used for the entirety of our analysis
SEED <- 12345
Authors: Group 4, Project Proposal (Acky Xu, Adam Mitha, Icy Xu, Michael DeMarco)
Abstract: This project looks at the impact of the COVID-19 pandemic on theft-related crime in select Vancouver neighborhoods. Inferential techniques are applied to estimate the difference between the proportion of theft crime relative to all crime in 2020 and the mean proportion over the three previous years, 2017 to 2019; we test whether there has been a statistically significant change in the proportion of theft-related crime across individual Vancouver neighborhoods. We anticipated finding a significant difference in theft-related crime overall, presumably due to the economic hardship induced by the COVID-19 pandemic in 2020. Instead, we found that roughly half of all examined neighborhoods saw no statistically significant change in theft-related crime; among the neighborhoods that did see a statistically significant change, the change did not seem to correlate with each neighborhood's average income, as we had anticipated.
A Vancouver Police Department (VPD) car.
Since the beginning of the COVID-19 pandemic in March of 2020, many have discussed the issue of allegedly increasing crime rates both globally and within Metro Vancouver. Indeed, an unprecedented economic contraction [Statistics Canada, 1], coupled with fewer resources such as shelters and food banks, has driven an increase in many types of crime, including fraud and counterfeiting [Interpol, 2], hate crimes (with those of East Asian descent impacted most significantly) [ABC News, 3], and domestic violence [UN News, 4].
These spikes have led to tension between municipal governments and police departments with respect to budgeting in the wake of the economic deprivation of the COVID-19 era. The City of Vancouver, specifically, enacted a freeze on the police budget in 2021 [CBC, 5]. However, there is little publicly available Vancouver-focused analysis of either theft-related crime specifically or trends in crime generally in the years leading up to that decision.
Existing literature relating to crime and the COVID-19 pandemic has primarily focused on "global" crime (e.g., organized crime, terrorist attacks), and has largely concluded that, on the whole, these types of crime increased through the pandemic year. In our analysis, we look closely at "petty" crime, specifically theft, and examine how, if at all, the rate of theft crime changed amidst the pandemic. We analyze all Metro Vancouver neighborhoods we have data for (twenty-four neighborhoods in total). In the appendix, we visualize the socioeconomic spectrum within Vancouver, grouped by neighborhood, which is discussed as part of our analysis. To do so, we use the City of Vancouver's 2016 census data on "local area profiles" and explore the "average total incomes among residents" [City of Vancouver, 9]. Here's a snapshot of what the data looks like:
Neighborhood | Average Income (2016, CAD) |
---|---|
Arbutus Ridge | \$62,675 |
Downtown | \$63,251 |
Dunbar/Southlands | \$78,117 |
Fairview | \$61,627 |
Grandview/Woodland | \$42,896 |
Hastings/Sunrise | \$38,258 |
Figure 1: A sample of wealth in Vancouver neighborhoods.
(While the VPD has no direct data on neighborhood wealth, we can use the City of Vancouver's local area profiles, as they match the VPD's neighborhood classifications. Other studies of mean incomes among residents of Vancouver neighborhoods aren't guaranteed to share the same neighborhood boundaries as the City of Vancouver and VPD data sets.) It should be noted that income levels in Vancouver neighborhoods do not naturally split into simple categories such as "upper class," "middle class," or "lower class"; there are also notable outliers such as Shaughnessy, with an average 2016 income of \$118,668. For more information, the reader is encouraged to look at the appendix at the end of this report.
We are interested in exploring how much, if at all, the proportion of petty crime has changed in Metro Vancouver over the last year. We will compare crime rates from 2017 to 2020 in twenty-four neighborhoods characteristic of the various strata of the full economic spectrum. Our analysis focuses specifically on theft, so we can best understand the intertwined relationship between the economic hardship brought on over this last year and the trends in crime it would seemingly induce (through job loss, for example).
Therefore, our research question is as follows: "How has the proportion of theft-related crime changed, if at all, in Vancouver during 2020, the year of the COVID-19 pandemic, when compared to the average of the previous three years (2017 to 2019) across the various strata of Vancouver's economic spectrum, studied via twenty-four Vancouver neighborhoods?"
We're considering our sample to be what is available in the Vancouver Police Department's database. Within the data set description, the VPD mentions "[t]he information provided [...] does not reflect the total number of calls or complaints made to the VPD" and "the crime classification [...] may change at any time based on the dynamic nature of police investigations." This alone means we cannot consider the data set to be entirely representative of all crime in Vancouver. Additionally, we have to consider that many crimes go unreported for various reasons, such as a fear of not being believed, insecurity, or a fear of getting into trouble; therefore, we have further reason to presume this data is not completely representative of all crimes, and all thefts, within Vancouver. While there is no specific study of unreported crime statistics in Vancouver, Statistics Canada's "Police-reported crime statistics in Canada, 2019" survey indicated that just "one-third (31\%) of crimes" are reported to the police [Statistics Canada, 13].
Our population is all actual crime that occurred in Vancouver, not just reported crime. Of course, it is impossible to know how much crime actually occurred, but through our data set sample we'll be able to produce an estimate. Unfortunately, as we are working with the proportion of theft-related crime relative to overall crime, attitudes towards policing over time become a rather significant confounding variable, as they could dramatically impact an individual's willingness to report a crime. There is significant cause to consider this confounding as well: 2020 saw both the killing of George Floyd by a Minneapolis police officer and little action taken by the government to address the missing and murdered Indigenous women (MMIW) crisis here in Canada, leading to furthered distrust between citizens and Canada's metropolitan police forces; this, among other limitations, will be discussed further in our analysis section.
We will be using the Vancouver Police Department's Open Crime Data [VPD, 6] to answer our research question. In particular, we'll need three columns from the data set:
Column | Description | Notes |
---|---|---|
Year | "A four-digit field that indicates the year when the reported crime activity" | We'll use data, specifically, from January 1st, 2017 to December 31st, 2020. |
Type | "The type of crime activities" | We're considering theft crime to be all crime types with "theft" explicitly in the name. |
Neighborhood | "Neighborhoods within the City of Vancouver are based on the census tract (CT) concept within census metropolitan area (CMA)." | We'll take three neighborhoods, representing the full economic spectrum. |
Figure 2: A description of the VPD Open Crime data set.
Our random variable of interest is the difference in the proportion of theft-related crime with respect to all crime between the mean of 2017 to 2019 and 2020 (i.e., before and during the COVID-19 pandemic). Given our population is all Vancouver crime, our parameter of interest is the difference in the proportion of theft-related crime from the last three years, 2017 to 2019, to 2020.
We will be estimating trends for all reported and unreported crime in Vancouver by making estimates with our data set of reported crime across twenty-four different Vancouver neighborhoods, each with its own level of wealth.
We'll start by importing the data set from its original source [VPD, 6], loading it into a data frame, and inspecting the results.
# Download the file to wherever your Jupyter notebook is located
url <- "https://geodash.vpd.ca/opendata/crimedata_download/crimedata_csv_all_years.zip"
filename <- "crime_data.zip"
download.file(url, destfile = filename)
# Data comes in as a zip, so we'll need to extract it
unzip("crime_data.zip")
# Read in the desired file
crime_data <- read_csv("crimedata_csv_all_years.csv")
-- Column specification ---------------------------------------------------------
cols(
  TYPE = col_character(),
  YEAR = col_double(),
  MONTH = col_double(),
  DAY = col_double(),
  HOUR = col_double(),
  MINUTE = col_double(),
  HUNDRED_BLOCK = col_character(),
  NEIGHBOURHOOD = col_character(),
  X = col_double(),
  Y = col_double()
)
# Inspect the first and last three rows of the data frame
head(crime_data, n = 3)
tail(crime_data, n = 3)
create_caption(3, "Exploring the initial VPD crime data set.")
TYPE | YEAR | MONTH | DAY | HOUR | MINUTE | HUNDRED_BLOCK | NEIGHBOURHOOD | X | Y |
---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> | <dbl> |
Break and Enter Commercial | 2012 | 12 | 14 | 8 | 52 | NA | Oakridge | 491285.0 | 5453433 |
Break and Enter Commercial | 2019 | 3 | 7 | 2 | 6 | 10XX SITKA SQ | Fairview | 490613.0 | 5457110 |
Break and Enter Commercial | 2019 | 8 | 27 | 4 | 12 | 10XX ALBERNI ST | West End | 491007.8 | 5459174 |
TYPE | YEAR | MONTH | DAY | HOUR | MINUTE | HUNDRED_BLOCK | NEIGHBOURHOOD | X | Y |
---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> | <dbl> |
Vehicle Collision or Pedestrian Struck (with Injury) | 2016 | 6 | 20 | 18 | 11 | YUKON ST / W KING EDWARD AVE | Riley Park | 491786 | 5455143 |
Vehicle Collision or Pedestrian Struck (with Injury) | 2013 | 12 | 3 | 9 | 49 | YUKON ST / W KING EDWARD AVE | Riley Park | 491835 | 5455126 |
Vehicle Collision or Pedestrian Struck (with Injury) | 2013 | 12 | 27 | 14 | 40 | YUKON ST / W KING EDWARD AVE | Riley Park | 491835 | 5455126 |
[1] "Figure #3: Exploring the initial VPD crime data set."
Our data largely comes in a "tidy" format already. By tidy, we mean (per the DSCI 100 course textbook [Data Science: A First Introduction, 12], which in turn draws on Wickham and others [R for Data Science, 11]) that:
- each row is a single observation,
- each column is a single variable, and
- each value is a single cell
However, there are four steps we can take to improve the data set for our use, numbered 1 through 4 in the code cells that follow. We're primarily interested in the year, type, and neighborhood of crimes, so we'll extract those columns and discard the rest. In addition, we'll focus on the years 2017 to 2020.
# 1. Generally, in R, you don't see ALL CAPS column names,
# so we can modify the column names manually (with the help
# of the janitor library)
oldw <- getOption("warn")
options(warn = -1) # Suppress "unable to translate '<U+00C4>' to native encoding" error messages
crime_data <- crime_data %>% clean_names()
head(crime_data)
create_caption(4, "Tidying the column names in the crime data.")
options(warn = oldw)
type | year | month | day | hour | minute | hundred_block | neighbourhood | x | y |
---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> | <dbl> |
Break and Enter Commercial | 2012 | 12 | 14 | 8 | 52 | NA | Oakridge | 491285.0 | 5453433 |
Break and Enter Commercial | 2019 | 3 | 7 | 2 | 6 | 10XX SITKA SQ | Fairview | 490613.0 | 5457110 |
Break and Enter Commercial | 2019 | 8 | 27 | 4 | 12 | 10XX ALBERNI ST | West End | 491007.8 | 5459174 |
Break and Enter Commercial | 2014 | 8 | 8 | 5 | 13 | 10XX ALBERNI ST | West End | 491015.9 | 5459166 |
Break and Enter Commercial | 2020 | 7 | 28 | 19 | 12 | 10XX ALBERNI ST | West End | 491015.9 | 5459166 |
Break and Enter Commercial | 2005 | 11 | 14 | 3 | 9 | 10XX ALBERNI ST | West End | 491021.4 | 5459161 |
[1] "Figure #4: Tidying the column names in the crime data."
# 2. Remove all NAs (Note: na.omit(...) applies to the entire data frame,
# but neighbourhood is used as an example)
total_rows <- crime_data %>% nrow()
print(sprintf("There are %d rows in the data frame", total_rows))
na_neighbourhoods <- sum(is.na(crime_data$neighbourhood))
print(sprintf("Originally, there were %d NA values in the neighbourhood column", na_neighbourhoods))
crime_data <- na.omit(crime_data)
na_neighbourhoods <- sum(is.na(crime_data$neighbourhood))
print(sprintf("Now, there are %d NA values in the neighbourhood column", na_neighbourhoods))
[1] "There are 668167 rows in the data frame" [1] "Originally, there were 70135 NA values in the neighbourhood column" [1] "Now, there are 0 NA values in the neighbourhood column"
# 3. Select only type, year, neighbourhood
# Filter to the date range of 2017-2020
crime_data <- crime_data %>%
select(type, year, neighbourhood) %>%
filter(2017 <= year & year <= 2020)
head(crime_data)
create_caption(5, "Selecting and filtering our crime data.")
type | year | neighbourhood |
---|---|---|
<chr> | <dbl> | <chr> |
Break and Enter Commercial | 2019 | Fairview |
Break and Enter Commercial | 2019 | West End |
Break and Enter Commercial | 2020 | West End |
Break and Enter Commercial | 2017 | West End |
Break and Enter Commercial | 2018 | West End |
Break and Enter Commercial | 2020 | West End |
[1] "Figure #5: Selecting and filtering our crime data."
# 4. Now, we'll convert `neighbourhood` and `type` to factors,
# since we're treating them as categorical data
crime_data <- crime_data %>%
mutate(type = as_factor(type)) %>%
mutate(neighbourhood = as_factor(neighbourhood))
head(crime_data)
create_caption(6, "Converting crime data's column data types.")
type | year | neighbourhood |
---|---|---|
<fct> | <dbl> | <fct> |
Break and Enter Commercial | 2019 | Fairview |
Break and Enter Commercial | 2019 | West End |
Break and Enter Commercial | 2020 | West End |
Break and Enter Commercial | 2017 | West End |
Break and Enter Commercial | 2018 | West End |
Break and Enter Commercial | 2020 | West End |
[1] "Figure #6: Converting crime data's column data types."
Next, let's see how many different types of crimes there are present in our data set.
# Grab all unique crime types, and print them in a pretty way
crime_types <- unique(crime_data$type)
print(crime_types)
[1] Break and Enter Commercial
[2] Break and Enter Residential/Other
[3] Mischief
[4] Other Theft
[5] Theft from Vehicle
[6] Theft of Bicycle
[7] Theft of Vehicle
[8] Vehicle Collision or Pedestrian Struck (with Fatality)
[9] Vehicle Collision or Pedestrian Struck (with Injury)
9 Levels: Break and Enter Commercial ... Vehicle Collision or Pedestrian Struck (with Injury)
As was mentioned in the introduction, we're interested in theft-related crime. Let's create a list of all theft-related crime types we see in this data set.
theft_crimes <- c("Other Theft", "Theft from Vehicle",
"Theft of Bicycle", "Theft of Vehicle")
We can also check the neighborhoods present in this data set.
# Grab all unique neighbourhoods, and print them in a pretty way
neighbourhoods <- unique(crime_data$neighbourhood)
print(neighbourhoods)
print(sprintf("There are %d unique neighbourhoods available in this data set.", length(neighbourhoods)))
 [1] Fairview                   West End
 [3] Central Business District  Hastings-Sunrise
 [5] Strathcona                 Grandview-Woodland
 [7] Mount Pleasant             Sunset
 [9] Kensington-Cedar Cottage   Stanley Park
[11] Shaughnessy                Marpole
[13] Oakridge                   Kitsilano
[15] Victoria-Fraserview        Kerrisdale
[17] West Point Grey            Riley Park
[19] Arbutus Ridge              Renfrew-Collingwood
[21] Killarney                  South Cambie
[23] Dunbar-Southlands          Musqueam
24 Levels: Fairview West End Central Business District ... Musqueam
[1] "There are 24 unique neighbourhoods available in this data set."
We're hoping to compare crime rates across years in neighborhoods that represent a distribution of average income levels. For our proposal, we're going to use all neighborhoods available in the VPD's data set; a distribution of the levels of wealth across these neighborhoods is available in the appendix, and further discussion of the relationship between the economy and crime appears in our analysis section.
# Explore data set as it currently stands after some tidying
head(crime_data, n = 3)
tail(crime_data, n = 3)
create_caption(7, "Exploring our tidied crime data once again.")
type | year | neighbourhood |
---|---|---|
<fct> | <dbl> | <fct> |
Break and Enter Commercial | 2019 | Fairview |
Break and Enter Commercial | 2019 | West End |
Break and Enter Commercial | 2020 | West End |
type | year | neighbourhood |
---|---|---|
<fct> | <dbl> | <fct> |
Vehicle Collision or Pedestrian Struck (with Injury) | 2019 | Mount Pleasant |
Vehicle Collision or Pedestrian Struck (with Injury) | 2018 | Mount Pleasant |
Vehicle Collision or Pedestrian Struck (with Injury) | 2020 | Mount Pleasant |
[1] "Figure #7: Exploring our tidied crime data once again."
Now, to get a better sense of our data set, we'll produce three plots to explore, at a high level, what crime looks like in Vancouver both over time and by neighborhood. Our three plots will be:
- a line plot of the number of reported crimes per year, by neighbourhood;
- a stacked bar chart of the total number of crimes by type and neighbourhood; and
- a waffle chart of the relative frequency of each crime type by year.
Let's begin with the line plot.
# To produce our line plot, first, we'll need to compute the total amount of crime
# by neighborhood and year
neighbourhood_year_counts <- crime_data %>%
group_by(neighbourhood, year) %>%
summarise(n = n())
head(neighbourhood_year_counts)
create_caption(8, "A table of counts of crime by neighborhood and year.")
`summarise()` has grouped output by 'neighbourhood'. You can override using the `.groups` argument.
neighbourhood | year | n |
---|---|---|
<fct> | <dbl> | <int> |
Fairview | 2017 | 1768 |
Fairview | 2018 | 1795 |
Fairview | 2019 | 2144 |
Fairview | 2020 | 1787 |
West End | 2017 | 2825 |
West End | 2018 | 3029 |
[1] "Figure #8: A tablet of counts of crime by neighborhood and year."
# 1. Finally, let's plot our data
crime_data_plot <- neighbourhood_year_counts %>%
ggplot() +
geom_line(aes(x = year, y = n, color = neighbourhood)) +
scale_x_continuous(limits = c(2017, 2020)) +
labs(x = "Year", y = "Number of Reported Crimes", color = "Neighbourhood") +
ggtitle("Number of Reported Crimes Per Year in Selected Vancouver Neighbourhoods\n") +
theme_stat201(18, 12)
crime_data_plot
create_caption(9, "A line plot depicting the number of total police-reported crimes over time by neighbourhood.")
[1] "Figure #9: A line plot depicting the number of total police-reported crimes over time by neighbourhood."
It's clear that crime is highest in Strathcona, followed by Marpole and then Shaughnessy. Crime also generally decreased in 2020, especially in Strathcona.
Alongside checking the number of crimes total in each neighborhood, it's also worth looking at how much of each type of crime is represented in our data set. Let's now look at the stacked bar chart.
# 2. Note that the stringr::str_wrap call wraps the long label names for crime types, and
# reorder places the largest crime count on the bottom
# We need especially distinct colors here, since we have many neighbourhoods; by default, R makes a gradient that "blends" the neighborhoods together
n_neighbourhoods <- 24
colors <- colorRampPalette(brewer.pal(8, "Set1"))(n_neighbourhoods)
crime_type_bar_plot <- crime_data %>%
ggplot(aes(x = reorder(str_wrap(type, 30), type, function(x) - length(x)), fill = neighbourhood)) +
geom_bar(position = "stack") +
labs(x = "Crime Type", y = "Total Number of Crimes Committed of Type", fill = "Neighbourhood") +
ggtitle("Total Number of Crimes Committed in Select Vancouver\nNeighbourhoods by Type (2017 to 2020)\n") +
coord_flip() +
theme_stat201(18, 12) +
scale_fill_manual(values = colors)
# Vehicle collisions that result in fatality are (thankfully) very low,
# and almost indistinguishable on the plot below
crime_type_bar_plot
create_caption(10, "A stacked bar chart depicting the total number of crimes by type and neighbourhood.")
[1] "Figure #10: A stacked bar chart depicting the total number of crimes by type and neighbourhood."
This plot shows that theft from vehicle, in the Strathcona region, makes up a large proportion of all crimes committed. We can also see, thankfully, that vehicle collisions resulting in fatality make up such a small fraction of our data they're hard to distinguish.
Finally, we can look at a visualization of the relative frequencies of crime over the years through a waffle chart. N.B: Each individual square represents 30 police-reported crimes.
# Let's produce a count of the crime by year and type
crime_waffle <- crime_data %>%
count(year, type) %>%
mutate(n = n / 30)
head(crime_waffle)
create_caption(11, "A table of crime counts by year and type, scaled so one waffle square represents 30 crimes.")
# The geom_waffle() function will do some magic for us in terms of rounding
year | type | n |
---|---|---|
<dbl> | <fct> | <dbl> |
2017 | Break and Enter Commercial | 73.20000 |
2017 | Break and Enter Residential/Other | 81.93333 |
2017 | Mischief | 178.80000 |
2017 | Other Theft | 182.40000 |
2017 | Theft from Vehicle | 430.13333 |
2017 | Theft of Bicycle | 73.23333 |
[1] "Figure #11: A table showing proportions of types of crimes."
# 3. Let's look at the proportion of each crime committed over the years
waffle_plot <- crime_waffle %>%
ggplot(aes(values = n, fill = type)) +
geom_waffle(n_rows = 40, flip = TRUE, color = "white", size = 0.25) +
facet_wrap(~year, nrow = 2, strip.position = "bottom") +
scale_x_discrete() +
scale_y_continuous(labels = function(x) x * 40, # make this multiplier the same as n_rows
expand = c(0,0)) +
labs(
title = "Waffle Bar Chart for Crime\nType Data From 2017-2020\n",
# subtitle = "A single square represents 30 crimes",
x = "Year",
y = "Count",
fill = "Crime Type"
) +
coord_equal() +
theme_enhance_waffle() +
theme(legend.position = "bottom") +
theme_stat201(18, 12)
waffle_plot
create_caption(12, "A waffle plot depicting the proportion of each crime type by year.")
[1] "Figure #12: A waffle plot depicting the proportion of each crime time by year."
No trends are particularly obvious from the waffle plot, though we can see a clear reduction in crime overall in 2020, and it seems as though it is in large part due to a steep decline in "theft from vehicle" crimes.
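As a quick sanity check on that observation, the short sketch below (using the `crime_data` frame as it stands at this point) tallies "Theft from Vehicle" reports per year; its output is not reproduced in this report, so treat it as an optional verification step rather than part of the main analysis.

# Optional check: count "Theft from Vehicle" reports per year (2017 to 2020)
# to confirm the apparent 2020 decline suggested by the waffle chart
crime_data %>%
filter(type == "Theft from Vehicle") %>%
count(year)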
Now, we can compute estimates of our parameter of interest across each of our different groups. In our case, this means computing the difference between the mean proportion of theft crimes committed across 2017 to 2019 and the proportion of theft crimes committed in 2020. Note that the proportions are computed as the total number of theft-related crimes within a given neighborhood relative to the total number of crimes that occurred within that neighborhood; hence the `group_by(...)` ahead of the computation of the proportion in the following code cell.
# Let's first compute the pre-COVID mean in each neighbourhood (merging 2017 to 2019)
before_pandemic_df_by_year <- crime_data %>%
mutate(is_theft = as.numeric(type %in% theft_crimes)) %>%
filter(year >= 2017 & year <= 2019) %>%
group_by(neighbourhood, year) %>% # group by neighbourhood and year first!
summarize(prop_theft = sum(is_theft) / n()) # compute theft crimes in neighbourhood / crimes in neighborhood
# View prop_theft by year (2017, 2018, 2019) and neighbourhood
# head(before_pandemic_df_by_year)
before_pandemic_df <- before_pandemic_df_by_year %>%
group_by(neighbourhood) %>%
summarize(prop_theft = mean(prop_theft)) %>%
mutate(time = "Before pandemic") %>%
mutate(time = as_factor(time))
head(before_pandemic_df)
create_caption(13, "A table of the proportions of thefts by period of time.")
`summarise()` has grouped output by 'neighbourhood'. You can override using the `.groups` argument.
neighbourhood | prop_theft | time |
---|---|---|
<fct> | <dbl> | <fct> |
Fairview | 0.6885614 | Before pandemic |
West End | 0.7348163 | Before pandemic |
Central Business District | 0.7525392 | Before pandemic |
Hastings-Sunrise | 0.6449970 | Before pandemic |
Strathcona | 0.5578307 | Before pandemic |
Grandview-Woodland | 0.6233940 | Before pandemic |
[1] "Figure #13: A table of the proportions of thefts by period of time."
# Now let's get an estimate for the COVID mean by neighbourhood
during_pandemic_df <- crime_data %>%
mutate(is_theft = as.numeric(type %in% theft_crimes)) %>%
filter(year == 2020) %>%
group_by(neighbourhood) %>%
summarize(prop_theft = sum(is_theft) / n()) %>%
mutate(time = "During pandemic") %>%
mutate(time = as_factor(time))
head(during_pandemic_df)
create_caption(14, "Another table of the proportions of thefts by period of time.")
neighbourhood | prop_theft | time |
---|---|---|
<fct> | <dbl> | <fct> |
Fairview | 0.6194740 | During pandemic |
West End | 0.6331310 | During pandemic |
Central Business District | 0.6046934 | During pandemic |
Hastings-Sunrise | 0.5693642 | During pandemic |
Strathcona | 0.4385246 | During pandemic |
Grandview-Woodland | 0.5506849 | During pandemic |
[1] "Figure #14: Another table of the proportions of thefts by period of time."
# Summarize in table
prop_diff_df <- during_pandemic_df %>%
rename(during_prop_theft = prop_theft) %>%
select(during_prop_theft) %>%
cbind(before_pandemic_df) %>%
mutate(prop_diff = during_prop_theft - prop_theft) %>%
select(neighbourhood, prop_diff)
prop_diff_df
create_caption(15, "A table capturing the differences in the proportion of theft-related crime over time.")
neighbourhood | prop_diff | |
---|---|---|
<fct> | <dbl> | |
1 | Fairview | -0.069087446 |
2 | West End | -0.101685330 |
3 | Central Business District | -0.147845807 |
4 | Hastings-Sunrise | -0.075632827 |
5 | Strathcona | -0.119306097 |
6 | Grandview-Woodland | -0.072709108 |
7 | Mount Pleasant | -0.104412043 |
8 | Sunset | -0.022212726 |
9 | Kensington-Cedar Cottage | 0.013052607 |
10 | Stanley Park | -0.277400372 |
11 | Shaughnessy | -0.025972254 |
12 | Marpole | -0.001418760 |
13 | Oakridge | -0.044870070 |
14 | Kitsilano | -0.029328789 |
15 | Victoria-Fraserview | -0.024270169 |
16 | Kerrisdale | 0.043059111 |
17 | West Point Grey | 0.011813281 |
18 | Riley Park | -0.088683373 |
19 | Arbutus Ridge | -0.044807260 |
20 | Renfrew-Collingwood | 0.028823518 |
21 | Killarney | -0.005602732 |
22 | South Cambie | -0.009784577 |
23 | Dunbar-Southlands | 0.049496980 |
24 | Musqueam | -0.344060813 |
[1] "Figure #15: A table capturing the differences in the proportion of theft-related crime over time."
prop_diff_df_summary <- prop_diff_df %>%
summarize(decreased = sum(prop_diff < 0),
increased = sum(prop_diff > 0))
prop_diff_df_summary
create_caption(16, "A table showing the number of neighborhoods that had increases and decreases in theft-related crime.")
print(sprintf("%s%% neighbourhoods saw an increase in crime, while the other %s%% neighbourhoods saw a decrease in crime.",
format(round(prop_diff_df_summary$increased / nrow(prop_diff_df) * 100, 2), nsmall = 2),
format(round(prop_diff_df_summary$decreased / nrow(prop_diff_df) * 100, 2), nsmall = 2)))
decreased | increased |
---|---|
<int> | <int> |
19 | 5 |
[1] "Figure #16: A table showing the number of neighborhoods that had increases and decreases in theft-related crime." [1] "20.83% neighbourhoods saw an increase in crime, while the other 79.17% neighbourhoods saw a decrease in crime."
Note that a negative value indicates a decrease in crime during 2020. Our initial data suggests that the proportion of theft-related crime has actually mostly decreased since the pandemic began. While this is a good estimate with which to begin our analysis, it's by no means sufficient to draw conclusions from; why that is the case is discussed below.
Our report uses data from the Vancouver Police Department to study trends in crime across select Vancouver neighborhoods. It is "trustworthy" in the sense that we will be careful to report ranges of plausible values for our final answer, rather than solely point estimates. However, that alone isn't enough to guarantee an effective analysis.
We're also using a kind of blocking (or simply "grouping") in our analysis that aids our interpretation of the results. Crime has been linked to wealth (more specifically, socioeconomic status, or SES) [World Bank, 8], so we'll treat neighbourhood wealth as a blocking variable and group our data by neighborhood, each representing a different level of wealth; however, we never randomly sample within those groups, as we're considering our data to already be a sample, so this is not exactly what "blocking" truly is. Regardless, this "grouping" step will help us draw more meaningful conclusions within our analysis.
Finally, we're being careful to not simply compare the trend of crime between 2019 and 2020. Since our focus is COVID-19, we're not especially interested in the 1-year trend, but rather, how 2020 has deviated from the "norm." It is plausible that 2019 was an outlier, so "merging" the 3-year span of 2017 to 2019 should provide us a better sense of the general, recent crime levels in Vancouver.
Some could argue that many of the "petty" crimes we're interested in, like theft, largely go unreported. And while our analysis does indeed aim to provide an estimate from this sample of strictly reported crimes, arguing what makes a "fair" range is a hard question to answer. Countless factors affect whether or not a crime gets reported, and trust in police generally was found to be at a "record low" in August of 2020 [New York Times, 10], and according to Statistics Canada, "less than half of Canadians thought their local police were doing a good job of being approachable and easy to talk to" [Statistics Canada, 14]. Accounting for this variability, statistically, is arduous.
Furthermore, while our neighborhood selections do represent a fair amount of spread in the wealth of various Vancouver neighborhoods, they're by no means perfectly representative. Shaughnessy, for one, being at a rather high-end extreme of wealth, might be less useful in our analysis, as it doesn't really generalize to any kind of useful population outside of that specific neighborhood.
Of course, the plots and estimates provided above are not sufficient for any stakeholder, such as the municipal government or the VPD themselves. Since we're working with a sample, we must report a range of plausible values, rather than a single point estimate. Thankfully, there are a few steps we can take to do this in different ways:
Firstly, we can produce a hypothesis test, where our null is that theft-related crime rates have not changed:
$$H_0: \Delta{p} = 0$$

where $\Delta{p} = p_1 - p_2$ and

- $p_1$ is the proportion of theft-related crime in 2020,
- $p_2$ is the proportion of theft-related crime, on average, between 2017 and 2019,

and thus $\Delta{p}$ is the difference in the proportion of theft-related crimes relative to all committed crimes (within a given neighborhood) between those two time periods.
Our alternative hypothesis is "that there has been some sort of change"; in this case, it's that there is a statistically significant difference in the proportion of theft-related crime between these two time periods:

$$H_A: \Delta{p} \ne 0$$

We'll have to do a two-tailed, two-sample z-test and check whether the resulting P-value falls below our significance level. We'll conduct our analysis with an alpha value of 5%. This is a relatively standard value to use and will allow us to gauge whether our findings are significant, or whether differences in proportions can instead be merely attributed to sampling variation.
Secondly, we can produce a confidence interval to report a range of plausible values alongside our difference in proportions statistic. We'll use two techniques, both bootstrapping (i.e., generating a bootstrapped sampling distribution) and asymptotics, to yield a range of values at a confidence level ("CL") of 95%.
(Note that two-tailed hypothesis tests and confidence intervals are practically equivalent at ${CL} = 1 - \alpha$, but doing both approaches is good for rigor and completeness, and allows us to use both bootstrapping and asymptotics.)
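To make that duality concrete, here is a small numeric illustration (entirely hypothetical counts, not our crime data), using the same unpooled standard error for both the test and the interval: the 95% interval excludes zero exactly when the two-tailed P-value falls below 5%.

# Hypothetical counts only -- a numeric illustration of the test/interval duality
s1 <- 300; n1 <- 500 # "successes" and trials for group 1
s2 <- 260; n2 <- 500 # "successes" and trials for group 2
p1 <- s1 / n1
p2 <- s2 / n2
se <- sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2) # unpooled standard error
z <- (p1 - p2) / se
p_value <- 2 * pnorm(-abs(z)) # two-tailed p-value
ci <- (p1 - p2) + c(-1, 1) * qnorm(0.975) * se # 95% interval
p_value < 0.05 # TRUE exactly when...
ci[1] > 0 | ci[2] < 0 # ...the interval excludes zero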
Along the way, we will be sure to visualize and interpret our results within the context of our problem. These techniques will also allow us to report something actually sufficient for a stakeholder: both a statistic and a range. This is at the core of inferential statistics.
As described in the analysis section, we will conduct three different analyses.
We begin with our bootstrapped hypothesis test, done via the `infer` package. Conducting a hypothesis test with `infer` requires the following steps (mirroring the code cells below):

- `specify` the response and explanatory variables,
- `hypothesise` in `infer` that the null hypothesis is that $\Delta{p} = 0$ (i.e., that `is_theft` is independent of `period`),
- `generate` replicates under that null model, and
- `calculate` the `"diff in props"` statistic for each replicate.
Before we're able to conduct the test, however, we'll have to format our data to capture both whether a crime is theft-related, `is_theft`, and whether it occurred before 2020 or during 2020, `period`. Let's begin by doing the necessary wrangling.
# The goal of the wrangling that follows is to add two columns to our data frame:
# a) `is_theft`, whether or not a crime is theft-related
# b) `period`, when the crime occurred ("Before pandemic", "During pandemic")
crime_data_is_theft <- crime_data %>%
mutate(is_theft = type %in% theft_crimes) %>% # check if in the list of crimes whitelisted as theft-related
select(neighbourhood, year, is_theft) # select relevant columns
head(crime_data_is_theft, n = 3) # we've now successfully completed step (a)
tail(crime_data_is_theft, n = 3)
create_caption(17, "Exploring further tidied crime data.")
neighbourhood | year | is_theft |
---|---|---|
<fct> | <dbl> | <lgl> |
Fairview | 2019 | FALSE |
West End | 2019 | FALSE |
West End | 2020 | FALSE |
neighbourhood | year | is_theft |
---|---|---|
<fct> | <dbl> | <lgl> |
Mount Pleasant | 2019 | FALSE |
Mount Pleasant | 2018 | FALSE |
Mount Pleasant | 2020 | FALSE |
[1] "Figure #17: Exploring further tidied crime data."
# As discussed, let's now add the `period` column
crime_data_processed <- crime_data_is_theft %>% # reuse dataframe from step (a)
filter(year >= 2017 & year <= 2020) %>% # filter to our target years
mutate(period = ifelse(year < 2020, 'Before pandemic', 'During pandemic')) %>% # since we've filtered already, we only need to check year < 2020 to know if it's pre-pandemic (or not)
mutate(period = as.factor(period)) # convert to factor
head(crime_data_processed, n = 3) # we've now successfully completed step (b)
tail(crime_data_processed, n = 3)
create_caption(18, "Exploring even further tidied crime data.")
neighbourhood | year | is_theft | period |
---|---|---|---|
<fct> | <dbl> | <lgl> | <fct> |
Fairview | 2019 | FALSE | Before pandemic |
West End | 2019 | FALSE | Before pandemic |
West End | 2020 | FALSE | During pandemic |
neighbourhood | year | is_theft | period |
---|---|---|---|
<fct> | <dbl> | <lgl> | <fct> |
Mount Pleasant | 2019 | FALSE | Before pandemic |
Mount Pleasant | 2018 | FALSE | Before pandemic |
Mount Pleasant | 2020 | FALSE | During pandemic |
[1] "Figure #18: Exploring even further tidied crime data."
We now have our desired data frame. Before we conduct the hypothesis test, we'll also need to gather a list of all twenty-four neighborhoods in the data set, as we'll have to iterate the hypothesis testing logic for each of these neighborhoods and append the results to our data frame. (We did this earlier in our exploration, but the reminder should be helpful.)
neighbourhoods <- unique(crime_data$neighbourhood) # get all unique neighborhoods
print(neighbourhoods)
 [1] Fairview                   West End
 [3] Central Business District  Hastings-Sunrise
 [5] Strathcona                 Grandview-Woodland
 [7] Mount Pleasant             Sunset
 [9] Kensington-Cedar Cottage   Stanley Park
[11] Shaughnessy                Marpole
[13] Oakridge                   Kitsilano
[15] Victoria-Fraserview        Kerrisdale
[17] West Point Grey            Riley Park
[19] Arbutus Ridge              Renfrew-Collingwood
[21] Killarney                  South Cambie
[23] Dunbar-Southlands          Musqueam
24 Levels: Fairview West End Central Business District ... Musqueam
Now we have everything we need to conduct our hypothesis test using bootstrapping. There are many valid approaches here; we could use some combination of `group_by` and `summarize` to work through each neighborhood. Instead, we opted to initially construct an empty data frame and successively bind our results to that data frame. Then, we loop through each of the neighborhoods in the previous vector.
Within the loop, the steps we must take are as follows:

1. Filter the data down to just the current neighborhood.
2. Create `pre_covid`, a data frame containing all reported crime for the given neighborhood that occurred from 2017 to 2019, and `covid`, a data frame containing all reported crime for the given neighborhood that occurred during 2020.
3. Compute the proportion of theft-related crime in each data frame by taking the mean of the `is_theft` column (treated as a numeric value instead of a boolean). This will effectively yield a proportion, as it will compute the sum of 1 + 1 + 0 + 1 + ... divided by the size of the data frame (see the small example just after this list).
4. Compute the difference of those two proportions, `obs_diff_in_props`, which is our test statistic.
5. Run the `infer` workflow to generate a null distribution for the neighborhood.
6. Compute the P-value of our test statistic against that null distribution.
7. Append the result as a new row to our results data frame.
Interpretations of the P-value are saved for a subsequent code cell, where we'll compare our P-values to our significance level, 5%.
Our final data frame will take on the following shape:
Column | Description |
---|---|
`neighbourhood` | The neighbourhood the crimes took place in. |
`past_prop` | The proportion of reported, theft-related crimes to all reported crimes from 2017 to 2019. |
`current_prop` | The proportion of reported, theft-related crimes to all reported crimes in 2020. |
`diff_in_props` | Our test statistic, equal to the difference of `current_prop` and `past_prop`. |
`p_value` | The likelihood of observing our test statistic under the null model, where there is no difference in the proportion of theft-related crimes. |
Figure 19: Showing the form of our final data frame for the bootstrapping workflow.
To further illustrate this workflow, we'll do one neighborhood on its own and visualize the null model, just to get a better sense of what's happening within the loop. We'll use Dunbar-Southlands as an example.
# Choose a neighbourhood to use
n <- "Dunbar-Southlands"
# 1. Filter data down to just this neighbourhood
neighbourhood_data <- crime_data_processed %>%
filter(neighbourhood == n)
# 2., 3., 4. done with a "shortcut" using infer
obs_diff_in_props <- neighbourhood_data %>%
specify(is_theft ~ period, success = "TRUE") %>% # specify our response variable
calculate(stat = "diff in props", order = c("During pandemic", "Before pandemic")) # compute the "diff in props" for each
# 5. Conduct the infer workflow
neighbourhood_infer <- neighbourhood_data %>%
specify(is_theft ~ period, success = "TRUE") %>% # specify our response variable
hypothesise(null = "independence") %>% # i.e., that the change in proportion is 0
generate(reps = 1000, type = "permute") %>% # generate 1,000 permutation replicates under the null model
calculate(stat = "diff in props", order = c("During pandemic", "Before pandemic")) # compute the "diff in props" for each
# 6. (Skipped! We don't need the P-value here yet.)
# 7. (Skipped! We don't need to append to anything just yet.)
neighbourhood_infer_vis <- neighbourhood_infer %>%
visualize() +
shade_p_value(obs_stat = obs_diff_in_props, direction = "two_sided") +
xlab("Difference in Proportion of Theft-Related Crime") +
theme_stat201(10, 6)
neighbourhood_infer_vis
create_caption(19, "Visualizing the p-value on a single null distribution for Dunbar-Southlands.")
print(sprintf("Our p-value is %f", get_p_value(neighbourhood_infer, obs_diff_in_props, direction = "two_sided")))
[1] "Figure #19: Visualizing the p-value on a single null distribution for Dunbar-Southlands." [1] "Our p-value is 0.080000"
set.seed(SEED) # set our seed to the global seed for this notebook
# Suppress warnings about reported p-values of 0; to reduce these, we could opt for more
# repetitions, but that is not feasible given the number of neighbourhoods (24) we're targeting!
oldw <- getOption("warn")
options(warn = -1) # Suppress warnings for the duration of the loop
# Initialize our empty data frame with our desired columns;
# see the Markdown table above for a description of the columns
columns <- c("neighbourhood", "past_prop", "current_prop", "diff_in_props", "p_value")
neighbourhoods_bootstrap_result <- data.frame(matrix(ncol = length(columns), nrow=0))
names(neighbourhoods_bootstrap_result) <- columns # assign column names
# For every neighbourhood
for (n in neighbourhoods) {
# 1. Filter data down to just this neighbourhood
neighbourhood_data <- crime_data_processed %>%
filter(neighbourhood == n)
# 2. Create two data frames to separate pre-pandemic and during-pandemic crime
pre_covid <- neighbourhood_data %>%
filter(period == "Before pandemic")
covid <- neighbourhood_data %>%
filter(period == "During pandemic")
# 3. Compute proportions of theft-related crime within each data frame, using `mean`
pre_covid_prop <- mean(as.numeric(pre_covid$is_theft))
covid_prop <- mean(as.numeric(covid$is_theft))
# 4. Compute our test statistic, the difference in proportions for this neighbourhood
obs_diff_in_props <- covid_prop - pre_covid_prop
# 5. Conduct the infer workflow
neighbourhood_infer <- neighbourhood_data %>%
specify(is_theft ~ period, success = "TRUE") %>% # specify our response variable
hypothesise(null = "independence") %>% # i.e., that the change in proportion is 0
generate(reps = 1000, type = "permute") %>% # generate 1,000 permutation replicates under the null model
calculate(stat = "diff in props", order = c("During pandemic", "Before pandemic")) # compute the "diff in props" for each
# 6. Compute the p-value, using our test statistic for this neighbourhood
# NOTE: the test is two-tailed!
p_value <- neighbourhood_infer %>% get_p_value(obs_diff_in_props, direction = "two_sided") %>% pull()
# 7. Format and append our observed result to our `neighbourhoods_bootstrap_result` data frame
new_obs <- data.frame(n, pre_covid_prop, covid_prop, obs_diff_in_props, p_value)
names(new_obs) <- columns
# NOTE: this is the line that actually does the "appending"; rbind binds rows by common columns
neighbourhoods_bootstrap_result <- rbind(neighbourhoods_bootstrap_result, new_obs)
}
# Reset warning messages
options(warn = oldw)
# Explore our results in `neighbourhoods_bootstrap_result`
head(neighbourhoods_bootstrap_result, n = 3)
tail(neighbourhoods_bootstrap_result, n = 3)
create_caption(20, "Exploring our bootstrapped results.")
neighbourhood | past_prop | current_prop | diff_in_props | p_value | |
---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | Fairview | 0.6891537 | 0.6194740 | -0.06967969 | 0 |
2 | West End | 0.7351588 | 0.6331310 | -0.10202785 | 0 |
3 | Central Business District | 0.7533982 | 0.6046934 | -0.14870486 | 0 |
neighbourhood | past_prop | current_prop | diff_in_props | p_value | |
---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | |
22 | South Cambie | 0.6170412 | 0.6040609 | -0.01298028 | 0.688 |
23 | Dunbar-Southlands | 0.5390335 | 0.5897436 | 0.05071013 | 0.066 |
24 | Musqueam | 0.5111111 | 0.1739130 | -0.33719807 | 0.014 |
[1] "Figure #20: Exploring out bootstrapped result."
We now need to interpret these P-values within the context of the problem. Let's first add a column that indicates whether or not we should reject the null hypothesis for each neighbourhood at a 5% significance level.
# Add on whether or not we should reject the null hypothesis at the 5% level
neighbourhoods_bootstrap_result <- neighbourhoods_bootstrap_result %>%
mutate(reject5 = p_value < 0.05)
neighbourhoods_bootstrap_result
create_caption(21, "Exploring our bootstrapped results with interpretations of the P-value")
neighbourhood | past_prop | current_prop | diff_in_props | p_value | reject5 |
---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> |
Fairview | 0.6891537 | 0.6194740 | -0.069679692 | 0.000 | TRUE |
West End | 0.7351588 | 0.6331310 | -0.102027854 | 0.000 | TRUE |
Central Business District | 0.7533982 | 0.6046934 | -0.148704859 | 0.000 | TRUE |
Hastings-Sunrise | 0.6451613 | 0.5693642 | -0.075797128 | 0.000 | TRUE |
Strathcona | 0.5577118 | 0.4385246 | -0.119187188 | 0.000 | TRUE |
Grandview-Woodland | 0.6237392 | 0.5506849 | -0.073054262 | 0.000 | TRUE |
Mount Pleasant | 0.6768553 | 0.5719462 | -0.104909124 | 0.000 | TRUE |
Sunset | 0.5896720 | 0.5651163 | -0.024555738 | 0.224 | FALSE |
Kensington-Cedar Cottage | 0.6043016 | 0.6161616 | 0.011860044 | 0.440 | FALSE |
Stanley Park | 0.7840000 | 0.5074627 | -0.276537313 | 0.000 | TRUE |
Shaughnessy | 0.4612766 | 0.4364641 | -0.024812507 | 0.416 | FALSE |
Marpole | 0.5688531 | 0.5661664 | -0.002686649 | 0.936 | FALSE |
Oakridge | 0.5432873 | 0.4968354 | -0.046451884 | 0.142 | FALSE |
Kitsilano | 0.6463512 | 0.6171575 | -0.029193753 | 0.050 | FALSE |
Victoria-Fraserview | 0.6036842 | 0.5797101 | -0.023974066 | 0.380 | FALSE |
Kerrisdale | 0.4675528 | 0.5106952 | 0.043142412 | 0.160 | FALSE |
West Point Grey | 0.5674500 | 0.5804878 | 0.013037848 | 0.756 | FALSE |
Riley Park | 0.6152695 | 0.5239437 | -0.091325799 | 0.000 | TRUE |
Arbutus Ridge | 0.5178026 | 0.4740061 | -0.043796529 | 0.200 | FALSE |
Renfrew-Collingwood | 0.6848298 | 0.7136490 | 0.028819192 | 0.018 | TRUE |
Killarney | 0.6045561 | 0.5986395 | -0.005916619 | 0.884 | FALSE |
South Cambie | 0.6170412 | 0.6040609 | -0.012980285 | 0.688 | FALSE |
Dunbar-Southlands | 0.5390335 | 0.5897436 | 0.050710132 | 0.066 | FALSE |
Musqueam | 0.5111111 | 0.1739130 | -0.337198068 | 0.014 | TRUE |
[1] "Figure #21: Exploring out bootstrapped result with interpretations of the P-value"
We'll now do a brief interpretation of the results. Since we're doing our analysis across all twenty-four available neighborhoods, we have twenty-four separate hypothesis tests to interpret; to make this more digestible, we'll write a "general" interpretation here and discuss the results further in the discussion section.
In general, if `reject5` is `TRUE`, indicating we should reject the null hypothesis at a 5% significance level, this means there was indeed a statistically significant difference in the proportion of theft-related crime between 2020 and the average of the proportions over 2017 to 2019. For example, in the Renfrew-Collingwood neighborhood, our P-value was just 0.018, or roughly 2%. This indicates that under the null model, we had about a 2% chance of observing a value as extreme or more extreme (in either the left or right tail, since the test was two-tailed) than our test statistic; since this is smaller than our 5% threshold, we do consider it significant. We conclude there is a statistically significant difference in the proportion of theft-related crime in 2020 compared to the mean of the past three years.
Likewise, in general, if `reject5` is `FALSE`, indicating we should not reject the null hypothesis at a 5% significance level, this means there was not a statistically significant difference in proportions. For example, in the Dunbar-Southlands neighborhood, our P-value was 0.066, or roughly 7%. This indicates that under the null model, we had about a 7% chance of observing a value as extreme or more extreme (in either the left or right tail, since the test was two-tailed) than our test statistic; since this is larger than our 5% threshold, we don't consider it significant. We conclude there was no statistically significant change in theft-related crime in 2020.
We can quickly compute the total numbers of rejected and non-rejected neighborhoods.
# Compute summary statistics for our bootstrap result
neighbourhoods_bootstrap_result_summary <- neighbourhoods_bootstrap_result %>%
summarize(rejected = sum(reject5),
non_rejected = sum(!reject5),
n = n())
neighbourhoods_bootstrap_result_summary
create_caption(22, "A table showing the number of rejected and non-rejected null hypotheses.")
print(sprintf("%s%% neighbourhoods did see a statistically significant change in the proportion of theft-related crime, while the other %s%% neighbourhoods did not.",
format(round(neighbourhoods_bootstrap_result_summary$rejected / neighbourhoods_bootstrap_result_summary$n * 100, 2), nsmall = 2),
format(round(neighbourhoods_bootstrap_result_summary$non_rejected / neighbourhoods_bootstrap_result_summary$n * 100, 2), nsmall = 2)))
rejected | non_rejected | n |
---|---|---|
<int> | <int> | <int> |
11 | 13 | 24 |
[1] "Figure #22: A table showing the number of rejected and non-rejected null hypotheses." [1] "45.83% neighbourhoods did see a statistically significant change in the proportion of theft-related crime, while the other 54.17% neighbourhoods did not."
In the discussion, we'll consider all twenty-four tests together and see what trends, if any, we notice, especially considering the different average income levels of each neighborhood.
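To preview how that comparison could be set up, here is a hedged sketch that joins a small, hand-built income table (just the handful of values from Figure 1, with names normalized to the VPD's hyphenated spellings, which is an assumption on our part) onto our hypothesis test results; the full analysis would use the complete local area profiles table rather than this partial one.

# A sketch only: join a partial, hand-built table of Figure 1 incomes onto our results
# (neighbourhood names are assumed to map onto the VPD's hyphenated spellings)
income_sample <- tibble(
neighbourhood = c("Arbutus Ridge", "Dunbar-Southlands", "Fairview", "Grandview-Woodland", "Hastings-Sunrise"),
avg_income_2016 = c(62675, 78117, 61627, 42896, 38258)
)
neighbourhoods_bootstrap_result %>%
inner_join(income_sample, by = "neighbourhood") %>%
select(neighbourhood, diff_in_props, p_value, reject5, avg_income_2016) %>%
arrange(avg_income_2016)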
Similarly, we could also construct confidence intervals via bootstrapping; this, in a way, "inverts" our analysis. Instead of beginning from an assumption of the true value, we'll use our sample to compute a range of plausible values.
Just like we did for hypothesis testing, we'll first construct a basic confidence interval for one neighborhood; then, we'll do it for all twenty-four. Additionally, we'll visualize which confidence intervals capture our null value of zero; this will suggest how many neighborhoods do indeed show a statistically significant difference in the proportion of theft-related crime. (Note that a two-tailed hypothesis test, as we did above, is effectively equivalent to a confidence interval; the interpretation lines up exactly, we just consider whether the interval captures zero instead.)
Our process is much more straightforward than before as well, which is a nice bonus:

1. Filter the `crime_data_processed` data frame to just the neighborhood we're interested in.
2. Generate a bootstrap distribution of the difference in proportions.
3. Compute the 95% confidence interval with `get_ci(...)`.
4. Append the interval to our results data frame.

We'll handle the logic of whether or not an interval captured zero later in the code.
# Choose a neighbourhood to use
n <- "Dunbar-Southlands"
# 1. Filter data down to just this neighbourhood
neighbourhood_data <- crime_data_processed %>%
filter(neighbourhood == n)
# 2. Generate our bootstrap distribution
neighbourhood_bootstrapped <- neighbourhood_data %>%
specify(is_theft ~ period, success = "TRUE") %>% # specify our response variable
generate(reps = 1000, type = "bootstrap") %>% # generate 1,000 bootstrap resamples of this neighbourhood's data
calculate(stat = "diff in props", order = c("During pandemic", "Before pandemic")) # compute the "diff in props" for each
# 3. Compute the 95% percentile confidence interval
neighbourhood_ci_95 <- neighbourhood_bootstrapped %>%
get_ci(level = 0.95, type = "percentile")
head(neighbourhood_ci_95)
create_caption(23, "The confidence interval, using bootstrapping, for Dunbar-Southlands")
lower_ci | upper_ci |
---|---|
<dbl> | <dbl> |
-0.002856118 | 0.1057797 |
[1] "Figure #23: The confidence interval, using bootstrapping, for Dunbar-Southlands"
This interval does capture the value of 0, as `lower_ci` is less than 0 and `upper_ci` is greater than 0. This means that Dunbar-Southlands would be considered to show no statistically significant change.
Let's try this process now on each neighborhood separately.
columns <- c("neighbourhood", "lower_ci", "upper_ci")
neighborhoods_bootstrap_cis <- data.frame(matrix(ncol = length(columns), nrow=0))
names(neighborhoods_bootstrap_cis) <- columns # assign column names
for (n in neighbourhoods) {
# 1. Filter data down to just this neighbourhood
neighbourhood_data <- crime_data_processed %>%
filter(neighbourhood == n)
# 2. Generate our bootstrap sample
neighbourhood_bootstrapped <- neighbourhood_data %>%
specify(is_theft ~ period, success = "TRUE") %>% # specify our response variable
generate(reps = 1000, type = "bootstrap") %>% # generate 1,000 bootstrap resamples of this neighbourhood's data
calculate(stat = "diff in props", order = c("During pandemic", "Before pandemic")) # compute the "diff in props" for each
# 3. Get our confidence interval
neighbourhood_ci_95 <- neighbourhood_bootstrapped %>%
get_ci(level = 0.95, type = "percentile")
# 4. Format and append our observed result to our `neighborhoods_bootstrap_cis` data frame
new_obs <- data.frame(n, neighbourhood_ci_95$lower_ci, neighbourhood_ci_95$upper_ci)
names(new_obs) <- columns
# NOTE: this is the line that actually does the "appending"; rbind binds rows by common columns
neighborhoods_bootstrap_cis <- rbind(neighborhoods_bootstrap_cis, new_obs)
}
# Append the `captured` boolean: whether the interval [lower_ci, upper_ci] contains 0
intervals_captured_95 <- neighborhoods_bootstrap_cis %>%
mutate(captured = (lower_ci <= 0 & 0 <= upper_ci))
# Explore our results
head(intervals_captured_95, n = 3)
tail(intervals_captured_95, n = 3)
create_caption(24, "Exploring all confidence intervals constructed via bootstrapping")
neighbourhood | lower_ci | upper_ci | captured | |
---|---|---|---|---|
<chr> | <dbl> | <dbl> | <lgl> | |
1 | Fairview | -0.09534229 | -0.04366537 | FALSE |
2 | West End | -0.12538623 | -0.08151268 | FALSE |
3 | Central Business District | -0.16111815 | -0.13610200 | FALSE |
neighbourhood | lower_ci | upper_ci | captured | |
---|---|---|---|---|
<chr> | <dbl> | <dbl> | <lgl> | |
22 | South Cambie | -0.069029038 | 0.04006276 | TRUE |
23 | Dunbar-Southlands | -0.002710485 | 0.10775335 | TRUE |
24 | Musqueam | -0.545885174 | -0.11523241 | FALSE |
[1] "Figure #24: Exploring all confidence intervals constructed via bootstapping"
We now have examples of intervals that both do and do not capture the value of 0; again, when the interval does not capture the value of 0, we would in turn reject a "supposed" null hypothesis that there is no difference in the proportions at a significance level of $\alpha = 1 - \text{CL}$, where "CL" stands in for our confidence level.
Let's now visualize our results for all neighborhoods.
# Visualize whether or not our confidence intervals captured the true value
many_ci_plot_95 <- intervals_captured_95 %>%
ggplot() +
scale_colour_manual(breaks = c("TRUE", "FALSE"),
values = c("grey", "black")) +
geom_segment(aes(x = lower_ci,
xend = upper_ci,
y = neighbourhood,
yend = neighbourhood,
color = captured)) +
geom_vline(xintercept = 0, colour = "red", size = 1) +
labs(title = "Confidence Intervals for Proportion of Theft Crime by Neighborhood (95%)",
y = "Neighbourhood",
x = "Confidence Interval",
colour = "Captured?") +
theme_stat201()
many_ci_plot_95
create_caption(25, "Visualizing confidence intervals constructed via bootstrapping")
[1] "Figure #25: Visualizing confidence intervals constructed via bootstapping"
It appears that roughly half of the neighborhoods' intervals capture the value of zero, while the rest do not; the Musqueam neighborhood is notable here as well, as its extremely small sample size, relative to other neighborhoods, led to a very wide confidence interval. We also observe that the Central Business District has seen a sharp change in the proportion of theft-related crime, as its interval is one of the narrowest observed yet is very far from containing zero.
We'll now conduct hypothesis testing based on results from the Central Limit Theorem (CLT). To conduct a two-sample z-test of proportions, we'll rely on the following formula:
$$ Z = \frac{(\hat{p_1} - \hat{p_2}) - 0}{\sqrt{\hat{p_{\text{pooled}}} \cdot (1 - \hat{p_{\text{pooled}}}) \cdot \left(\frac{1}{n_1} + \frac{1}{n_2}\right)}} $$

where
$Z$ is the standardized z-score, to be used as our normalized test statistic
$\hat{p_1}$ is the proportion of theft-related crime relative to all crime in 2020
$\hat{p_2}$ is the proportion of theft-related crime relative to all crime on average from 2017 to 2019
$\hat{p_{\text{pooled}}}$ is the pooled proportion of $\hat{p_1}$ and $\hat{p_2}$
${n_1}$ is the total number of crimes in 2020 (for a given neighborhood)
${n_2}$ is the number of crimes on average from 2017 to 2019 (for a given neighborhood)
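To make the formula concrete, here is a minimal sketch with hypothetical counts (not taken from our data set):
# Hypothetical counts, purely to illustrate the two-sample z-test formula
s1 <- 120; n1 <- 400   # theft crimes and total crimes in 2020
s2 <- 500; n2 <- 1300  # theft crimes and total crimes, 2017-2019 average
p1_hat <- s1 / n1
p2_hat <- s2 / n2
p_pooled <- (s1 + s2) / (n1 + n2)
z <- (p1_hat - p2_hat) / sqrt(p_pooled * (1 - p_pooled) * (1 / n1 + 1 / n2))
p_value <- 2 * pnorm(-abs(z)) # two-tailed p-value
c(z = z, p_value = p_value)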
It looks like we'll have to filter out the Musqueam neighborhood for this portion of our analysis.
Let's first define a helper function that will, from our processed crime data, compute the needed summary statistics for a given neighbourhood during a given time period. The summary statistics we need to compute a difference in proportions are the proportion of theft-related crime relative to all crime, `prop`, the total number of theft-related crimes, `s` (for "success"), and the total number of crimes, `n`. These values will be used in the calculation of our z-score.
Note that in lieu of the following helper function, we could alternatively have used the `group_by` and `summarize` pattern to achieve a similar workflow.
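For reference, a sketch of that alternative might look like the following (assuming `crime_data_processed` contains the `neighbourhood`, `period`, and `is_theft` columns used throughout):
# A sketch of the group_by/summarize alternative: summary statistics for
# every neighbourhood and period at once, rather than one pair at a time
crime_data_summaries <- crime_data_processed %>%
    group_by(neighbourhood, period) %>%
    summarize(prop = mean(is_theft),
              s = sum(is_theft),
              n = n(),
              .groups = "drop")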
set.seed(SEED)
# Compute needed summary statistics for a given neighborhood `n`, for before or after the pandemic, `p`
get_crime_data_summary <- function(n, p) {
crime_data_filtered <- crime_data_processed %>%
filter(neighbourhood == n) %>%
filter(period == p)
crime_data_summary <- crime_data_filtered %>%
summarize(prop = mean(is_theft),
s = sum(is_theft),
n = n())
return (crime_data_summary)
}
head(get_crime_data_summary("Kitsilano", "Before pandemic"))
create_caption(26, "Showing example call to summary data helper function")
prop | s | n |
---|---|---|
<dbl> | <int> | <int> |
0.6463512 | 3224 | 4988 |
[1] "Figure #26: Showing example call to summary data helper function"
We can now compute our "before pandemic" summary statistics.
# Initialize an empty tibble, as before
crime_data_past_summary <- tibble(neighbourhood = "", prop = 0, s = 0, n = 0)
# For every neighbourhood available
for (neighbourhood in neighbourhoods) {
# Call helper function
stats <- get_crime_data_summary(neighbourhood, "Before pandemic")
# Pull the data we need
prop <- pull(stats[1])
s <- pull(stats[2])
n <- pull(stats[3])
# Append a row to our resulting data frame
crime_data_past_summary <- add_row(crime_data_past_summary, neighbourhood = neighbourhood, prop = prop, s = s, n = n)
}
# Drop the placeholder row used to initialize the tibble
crime_data_past_summary <- crime_data_past_summary[-1,]
# Explore our results
head(crime_data_past_summary, n = 3)
tail(crime_data_past_summary, n = 3)
create_caption(27, "Exploring summarized data for before the pandemic")
neighbourhood | prop | s | n |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
Fairview | 0.6891537 | 3933 | 5707 |
West End | 0.7351588 | 6712 | 9130 |
Central Business District | 0.7533982 | 24997 | 33179 |
neighbourhood | prop | s | n |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
South Cambie | 0.6170412 | 659 | 1068 |
Dunbar-Southlands | 0.5390335 | 725 | 1345 |
Musqueam | 0.5111111 | 23 | 45 |
[1] "Figure #27: Exploring summarized data for before the pandemic"
We can again re-use our helper to do this kind of analysis for our 2020, or "during pandemic," data.
# Initialize an empty tibble, as before
crime_data_during_summary <- tibble(neighbourhood = "", prop = 0, s = 0, n = 0)
# For every neighbourhood available
for (neighbourhood in neighbourhoods) {
# Call helper function
stats <- get_crime_data_summary(neighbourhood, "During pandemic")
# Pull the data we need
prop <- pull(stats[1])
s <- pull(stats[2])
n <- pull(stats[3])
# Append a row to our resulting data frame
crime_data_during_summary <- add_row(crime_data_during_summary, neighbourhood = neighbourhood, prop = prop, s = s, n = n)
}
# Drop the placeholder row used to initialize the tibble
crime_data_during_summary <- crime_data_during_summary[-1,]
# Explore our results
head(crime_data_during_summary, n = 3)
tail(crime_data_during_summary, n = 3)
create_caption(28, "Exploring summarized data for during the pandemic")
neighbourhood | prop | s | n |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
Fairview | 0.6194740 | 1107 | 1787 |
West End | 0.6331310 | 1460 | 2306 |
Central Business District | 0.6046934 | 4664 | 7713 |
neighbourhood | prop | s | n |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
South Cambie | 0.6040609 | 238 | 394 |
Dunbar-Southlands | 0.5897436 | 253 | 429 |
Musqueam | 0.1739130 | 4 | 23 |
[1] "Figure #28: Exploring summarized data for during the pandemic"
Finally, let's combine the data.
# We'll first need to rename our columns to prevent collisions
crime_data_past_summary_tidy <- crime_data_past_summary %>%
rename(past_prop = prop,
past_s = s,
past_n = n)
crime_data_during_summary_tidy <- crime_data_during_summary %>%
rename(curr_prop = prop,
curr_s = s,
curr_n = n)
# Now, we left_join our two data frames (combining them by common neighbourhood names)
crime_data_merged <- left_join(crime_data_past_summary_tidy, crime_data_during_summary_tidy, by = "neighbourhood")
Note that for our application of the CLT to be valid, we'll also need to check a few key assumptions: for each neighbourhood, both before and during the pandemic, we need at least 10 expected "successes" (theft crimes) and at least 10 expected "failures" (non-theft crimes); that is, $n\hat{p} \geq 10$ and $n(1 - \hat{p}) \geq 10$.
# Merge our crime data and add a variable to verify if we met the CLT conditions
crime_data_merged <- crime_data_merged %>%
mutate(met = past_n * past_prop >= 10 & past_n * (1 - past_prop) >= 10 & curr_n * curr_prop >= 10 & curr_n * (1 - curr_prop) >= 10)
crime_data_merged
create_caption(29, "Showing whether or not merged data meets CLT assumptions")
neighbourhood | past_prop | past_s | past_n | curr_prop | curr_s | curr_n | met |
---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> |
Fairview | 0.6891537 | 3933 | 5707 | 0.6194740 | 1107 | 1787 | TRUE |
West End | 0.7351588 | 6712 | 9130 | 0.6331310 | 1460 | 2306 | TRUE |
Central Business District | 0.7533982 | 24997 | 33179 | 0.6046934 | 4664 | 7713 | TRUE |
Hastings-Sunrise | 0.6451613 | 2500 | 3875 | 0.5693642 | 591 | 1038 | TRUE |
Strathcona | 0.5577118 | 3305 | 5926 | 0.4385246 | 749 | 1708 | TRUE |
Grandview-Woodland | 0.6237392 | 3463 | 5552 | 0.5506849 | 804 | 1460 | TRUE |
Mount Pleasant | 0.6768553 | 4916 | 7263 | 0.5719462 | 1105 | 1932 | TRUE |
Sunset | 0.5896720 | 1690 | 2866 | 0.5651163 | 486 | 860 | TRUE |
Kensington-Cedar Cottage | 0.6043016 | 2613 | 4324 | 0.6161616 | 793 | 1287 | TRUE |
Stanley Park | 0.7840000 | 392 | 500 | 0.5074627 | 34 | 67 | TRUE |
Shaughnessy | 0.4612766 | 542 | 1175 | 0.4364641 | 158 | 362 | TRUE |
Marpole | 0.5688531 | 1483 | 2607 | 0.5661664 | 415 | 733 | TRUE |
Oakridge | 0.5432873 | 866 | 1594 | 0.4968354 | 157 | 316 | TRUE |
Kitsilano | 0.6463512 | 3224 | 4988 | 0.6171575 | 964 | 1562 | TRUE |
Victoria-Fraserview | 0.6036842 | 1147 | 1900 | 0.5797101 | 280 | 483 | TRUE |
Kerrisdale | 0.4675528 | 598 | 1279 | 0.5106952 | 191 | 374 | TRUE |
West Point Grey | 0.5674500 | 652 | 1149 | 0.5804878 | 238 | 410 | TRUE |
Riley Park | 0.6152695 | 1644 | 2672 | 0.5239437 | 372 | 710 | TRUE |
Arbutus Ridge | 0.5178026 | 509 | 983 | 0.4740061 | 155 | 327 | TRUE |
Renfrew-Collingwood | 0.6848298 | 3783 | 5524 | 0.7136490 | 1281 | 1795 | TRUE |
Killarney | 0.6045561 | 1035 | 1712 | 0.5986395 | 264 | 441 | TRUE |
South Cambie | 0.6170412 | 659 | 1068 | 0.6040609 | 238 | 394 | TRUE |
Dunbar-Southlands | 0.5390335 | 725 | 1345 | 0.5897436 | 253 | 429 | TRUE |
Musqueam | 0.5111111 | 23 | 45 | 0.1739130 | 4 | 23 | FALSE |
[1] "Figure #29: Showing whether or not merged data meets CLT assumptions"
It looks like the Musqueam neighborhood will not be valid for the rest of our analysis.
# Filter out the Musqueam neighborhood
crime_data_merged_clt <- crime_data_merged %>%
filter(neighbourhood != "Musqueam")
From a quick glance at the data, there seems to be a difference in the proportion of thefts between 2020 and the previous years. In fact, quite a few neighbourhoods seem to show a decrease, as we saw in our initial exploration as well.
We can now compute our test statistic as before and determine our p-value. The full workflow for conducting our hypothesis test is as follows: (1) compute the pooled proportion, (2) compute the standardized z-statistic, (3) compute the two-tailed p-value using `pnorm(...)`, an R built-in, and (4) compare the p-value against our 5% significance level.
# 1. Compute the pooled proportion
crime_data_pooled <- crime_data_merged_clt %>%
mutate(pooled_prop = ((curr_n * curr_prop) + (past_n * past_prop)) / (curr_n + past_n))
# 2., 3., 4., "Conduct the test"
p0_diff <- 0
crime_data_stat <- crime_data_pooled %>%
mutate(stat = (curr_prop - past_prop - p0_diff) / sqrt(pooled_prop * (1 - pooled_prop) * (1 / curr_n + 1 / past_n)),
# could just as well be done with prop.test(...) %>% tidy()
p_value = 2 * pnorm(-abs(stat), lower.tail = TRUE), # two-tailed test; -abs(stat) keeps us in the lower tail regardless of the sign of stat
reject5 = p_value < 0.05)
head(crime_data_stat, n = 3)
tail(crime_data_stat, n = 3)
create_caption(30, "Exploring result of asymptotics hypothesis testing")
neighbourhood | past_prop | past_s | past_n | curr_prop | curr_s | curr_n | met | pooled_prop | stat | p_value | reject5 |
---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> | <dbl> | <dbl> | <dbl> | <lgl> |
Fairview | 0.6891537 | 3933 | 5707 | 0.6194740 | 1107 | 1787 | TRUE | 0.6725380 | -5.477429 | 4.315508e-08 | TRUE |
West End | 0.7351588 | 6712 | 9130 | 0.6331310 | 1460 | 2306 | TRUE | 0.7145855 | -9.693522 | 3.212542e-22 | TRUE |
Central Business District | 0.7533982 | 24997 | 33179 | 0.6046934 | 4664 | 7713 | TRUE | 0.7253497 | -26.356367 | 4.338609e-153 | TRUE |
neighbourhood | past_prop | past_s | past_n | curr_prop | curr_s | curr_n | met | pooled_prop | stat | p_value | reject5 |
---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> | <dbl> | <dbl> | <dbl> | <lgl> |
Killarney | 0.6045561 | 1035 | 1712 | 0.5986395 | 264 | 441 | TRUE | 0.6033442 | -0.2264818 | 0.82082672 | FALSE |
South Cambie | 0.6170412 | 659 | 1068 | 0.6040609 | 238 | 394 | TRUE | 0.6135431 | -0.4522423 | 0.65109442 | FALSE |
Dunbar-Southlands | 0.5390335 | 725 | 1345 | 0.5897436 | 253 | 429 | TRUE | 0.5512965 | 1.8388037 | 0.06594406 | FALSE |
[1] "Figure #30: Exploring result of asymptotics hypothesis testing"
Again, we can quickly summarize this data and see what proportion of neighbourhoods had their null hypothesis rejected.
# Produce summary statistics for our asymptotics hypothesis test
crime_data_stat_summary <- crime_data_stat %>%
summarize(rejected = sum(reject5),
non_rejected = sum(!reject5),
n = n())
crime_data_stat_summary
create_caption(31, "A table containing the number of rejected and non-rejected null hypotheses")
print(sprintf("Using asymptotics, %s%% of neighbourhoods saw a statistically significant change in the proportion of theft-related crime, while the other %s%% did not.",
format(round(crime_data_stat_summary$rejected / crime_data_stat_summary$n * 100, 2), nsmall = 2),
format(round(crime_data_stat_summary$non_rejected / crime_data_stat_summary$n * 100, 2), nsmall = 2)))
rejected | non_rejected | n |
---|---|---|
<int> | <int> | <int> |
11 | 12 | 23 |
[1] "Figure #31: A table containing the number of rejected and non-rejected null hypotheses" [1] "Using asymptotics 47.83% neighbourhoods did see a statistically significant change in the proportion of theft-related crime, while the other 52.17% neighbourhoods did not."
One thing to note is that a large majority of the neighbourhoods, whether or not their null hypothesis was rejected, saw a decrease in the proportion of reported thefts in 2020 compared to past years.
Finally, this brings us to the last stage of our analysis. Just as we did with bootstrapping, we'll also construct a 95% confidence interval using classical techniques.
This time, we'll need to compute the following:
$$\Delta\hat{p} = \hat{p_1} - \hat{p_2}$$

the difference in proportions, as we've seen before, and

$$SE_{\hat{p_1} - \hat{p_2}} = \sqrt{\frac{\hat{p_1} \cdot (1 - \hat{p_1})}{n_1} + \frac{\hat{p_2} \cdot (1 - \hat{p_2})}{n_2}}$$

where $SE_{\hat{p_1} - \hat{p_2}}$ is the standard error of the difference in proportions.
Note that the definitions of $\hat{p_i}$ and $n_i$ carry forward from before.
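As a quick numeric illustration of these two formulas (hypothetical proportions and counts, not taken from our data):
# Hypothetical values, purely to illustrate the CI formulas above
p1_hat <- 0.60; n1 <- 400   # 2020
p2_hat <- 0.65; n2 <- 1300  # 2017-2019 average
diff_prop <- p1_hat - p2_hat
se <- sqrt(p1_hat * (1 - p1_hat) / n1 + p2_hat * (1 - p2_hat) / n2)
# 95% confidence interval for the difference via normal quantiles
c(lower = qnorm(0.025, diff_prop, se), upper = qnorm(0.975, diff_prop, se))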
We'll also need to check our assumptions to apply the CLT. Note these are the same assumptions we checked before! We'll use our data set from before, `crime_data_merged_clt`, which already has the Musqueam neighborhood filtered out.
Now that those conditions are met, we can apply the CLT. Let's compute the confidence interval based on Central Limit Theorem.
# Compute the needed statistics from above
crime_data_ci <- crime_data_merged_clt %>%
mutate(diff_prop = curr_prop - past_prop,
diff_prop_se = sqrt((curr_prop * (1 - curr_prop) / curr_n) + (past_prop * (1 - past_prop) / past_n)))
# Compute the interval using qnorm
crime_data_ci_95 <- crime_data_ci %>%
mutate(lower_ci = qnorm(0.025, diff_prop, diff_prop_se),
upper_ci = qnorm(0.975, diff_prop, diff_prop_se)) %>%
select(neighbourhood, lower_ci, upper_ci)
# Indicate whether or not an interval captured 0
intervals_captured_95 <- crime_data_ci_95 %>%
mutate(captured = (lower_ci <= 0 & 0 <= upper_ci))
head(intervals_captured_95, n = 3)
tail(intervals_captured_95, n = 3)
create_caption(32, "Exploring confidence intervals generated via asymptotics")
neighbourhood | lower_ci | upper_ci | captured |
---|---|---|---|
<chr> | <dbl> | <dbl> | <lgl> |
Fairview | -0.09519298 | -0.04416640 | FALSE |
West End | -0.12368100 | -0.08037471 | FALSE |
Central Business District | -0.16056083 | -0.13684888 | FALSE |
neighbourhood | lower_ci | upper_ci | captured |
---|---|---|---|
<chr> | <dbl> | <dbl> | <lgl> |
Killarney | -0.057194051 | 0.04536081 | TRUE |
South Cambie | -0.069388095 | 0.04342753 | TRUE |
Dunbar-Southlands | -0.002919787 | 0.10434005 | TRUE |
[1] "Figure #32: Exploring confidence intervals generated via asymptotics"
Again, we can visualize our results.
# Again, visualize if our confidence intervals captured zero
many_ci_plot_95 <- intervals_captured_95 %>%
ggplot() +
scale_colour_manual(breaks = c("TRUE", "FALSE"),
values = c("grey", "black")) +
geom_segment(aes(x = lower_ci,
xend = upper_ci,
y = neighbourhood,
yend = neighbourhood,
colour = captured)) +
geom_vline(xintercept = 0, colour = "red", size = 1) +
labs(title = "Confidence Intervals for Proportion of Theft Crime by Neighborhood (95%)",
y = "Neighbourhood",
x = "Confidence Interval",
colour = "Captured?") +
theme_stat201(10, 12)
many_ci_plot_95
create_caption(33, "Visualizing confidence intervals generated via asymptotics")
[1] "Figure #33: Visualizing confidence intervals generated via asymptotics"
Again, if a confidence interval captures zero, there is no statistically significant difference in the proportion of theft-related crime between 2020 and before 2020; if it does not capture zero, there is indeed a statistically significant difference.
We find once again that, based on 95% confidence intervals for the twenty-three neighbourhoods considered (Musqueam excluded), twelve intervals capture zero and eleven do not.
The question of which method is preferable is relatively clear within our analysis. While asymptotics is a completely valid technique, sample sizes varied greatly across neighborhoods, which limited the technique as a whole; we even had to filter out the Musqueam neighborhood. In a case such as this one, there is truly nothing you can do to obtain a larger sample. Using the CLT means our sample sizes must be "large enough," and when working with data such as this, where many different sample sizes are being studied at once, that can be a severe limitation.
Bootstrapping is a clear winner here. We could apply the technique to all of the neighborhoods available in the data set without issue. This means that our preferred final results are as follows.
# Print our results again
neighbourhoods_bootstrap_result
create_caption(34, "A table showing our final P-values, found via bootstrapping.")
neighbourhood | past_prop | current_prop | diff_in_props | p_value | reject5 |
---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> |
Fairview | 0.6891537 | 0.6194740 | -0.069679692 | 0.000 | TRUE |
West End | 0.7351588 | 0.6331310 | -0.102027854 | 0.000 | TRUE |
Central Business District | 0.7533982 | 0.6046934 | -0.148704859 | 0.000 | TRUE |
Hastings-Sunrise | 0.6451613 | 0.5693642 | -0.075797128 | 0.000 | TRUE |
Strathcona | 0.5577118 | 0.4385246 | -0.119187188 | 0.000 | TRUE |
Grandview-Woodland | 0.6237392 | 0.5506849 | -0.073054262 | 0.000 | TRUE |
Mount Pleasant | 0.6768553 | 0.5719462 | -0.104909124 | 0.000 | TRUE |
Sunset | 0.5896720 | 0.5651163 | -0.024555738 | 0.224 | FALSE |
Kensington-Cedar Cottage | 0.6043016 | 0.6161616 | 0.011860044 | 0.440 | FALSE |
Stanley Park | 0.7840000 | 0.5074627 | -0.276537313 | 0.000 | TRUE |
Shaughnessy | 0.4612766 | 0.4364641 | -0.024812507 | 0.416 | FALSE |
Marpole | 0.5688531 | 0.5661664 | -0.002686649 | 0.936 | FALSE |
Oakridge | 0.5432873 | 0.4968354 | -0.046451884 | 0.142 | FALSE |
Kitsilano | 0.6463512 | 0.6171575 | -0.029193753 | 0.050 | FALSE |
Victoria-Fraserview | 0.6036842 | 0.5797101 | -0.023974066 | 0.380 | FALSE |
Kerrisdale | 0.4675528 | 0.5106952 | 0.043142412 | 0.160 | FALSE |
West Point Grey | 0.5674500 | 0.5804878 | 0.013037848 | 0.756 | FALSE |
Riley Park | 0.6152695 | 0.5239437 | -0.091325799 | 0.000 | TRUE |
Arbutus Ridge | 0.5178026 | 0.4740061 | -0.043796529 | 0.200 | FALSE |
Renfrew-Collingwood | 0.6848298 | 0.7136490 | 0.028819192 | 0.018 | TRUE |
Killarney | 0.6045561 | 0.5986395 | -0.005916619 | 0.884 | FALSE |
South Cambie | 0.6170412 | 0.6040609 | -0.012980285 | 0.688 | FALSE |
Dunbar-Southlands | 0.5390335 | 0.5897436 | 0.050710132 | 0.066 | FALSE |
Musqueam | 0.5111111 | 0.1739130 | -0.337198068 | 0.014 | TRUE |
[1] "Figure #34: A table showing our final P-values, found via bootstrapping."
In our analysis, we found that after conducting a hypothesis test for each Vancouver neighborhood to see whether or not there was a statistically significant change in the proportion of theft-related crime relative to all crime between 2020 and the average of 2017 to 2019, at a 5% significance level, roughly half of the neighborhoods did not show a significant change. In tandem, we conducted an analysis using asymptotics, but ultimately found that bootstrapping led to more reliable results and allowed us to use all of the neighborhoods available in our original data set.
Notably, looking at the neighborhoods where we do observe a difference, that difference is largely a reduction in crime, as the difference in proportions was in fact negative. This would be a good candidate for further analysis, as we could use left-tailed tests, instead of two-tailed tests, to confirm whether this may indeed be a more general trend (see the sketch below). This goes against what we anticipated finding; while COVID-19 has indeed led to an unprecedented economic decline for many countries and likewise many major cities across the world, it seems that in Vancouver this has not led to a distinct increase in theft-related crime across the board. Among the neighborhoods for which we observed a statistically significant difference, there is no clear economic grouping; there are so-called "middle class" neighborhoods like Fairview and West End, more "upper class" neighborhoods like Kitsilano, and "lower class" neighborhoods like Strathcona. At least in our analysis, there was no clear link between a neighborhood's average income level and whether or not a significant difference in the proportion of theft-related crime was observed.
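As a rough sketch of what such a left-tailed test could look like, the z-statistics already computed in `crime_data_stat` could be reused; this is an illustration only, not part of the analysis above:
# A sketch of a left-tailed alternative (H_a: the theft proportion decreased),
# reusing the z-statistics computed earlier; illustration only
crime_data_stat_left <- crime_data_stat %>%
    mutate(p_value_left = pnorm(stat, lower.tail = TRUE),
           reject5_left = p_value_left < 0.05)
head(crime_data_stat_left, n = 3)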
There are many reasons why there might be no difference in theft crime, or a decrease where a notable difference exists at all. "Petty" crimes such as theft often require some sort of human interaction; however, due to the pandemic, there has been an understandable reduction in social interaction for everyone, including thieves. Additionally, locations where petty crimes may occur, such as restaurants, bars, or stores, have had to close their doors for sustained periods throughout 2020, and unfortunately into 2021, so there has clearly been less opportunity for petty crimes to take place.
Additionally, it's worth returning to our initial point about "reported" crimes. Treating "reported" crimes as a sample of all "actual" crime may simply be an inappropriate way to handle the data. Estimating just how many crimes go unreported is a daunting task, and while we were able to somewhat account for unreported crimes by applying inferential statistical techniques, the vast uncertainty and high sensitivity in these figures might render our results useless in the end. In 2020, it is likely that, on the whole, many people thought longer before calling the police in various situations; there has especially been a push, at least in academia, towards de-escalation (without involving the police) for small crimes such as the ones we look at in this report [NYT Magazine, 15]. Whether or not this has been observed in Vancouver is a question for future studies, but it is again worth noting when interpreting our results.
In our final report, we expected to find that the rate of theft-related crime had changed significantly from past years, supporting the hypothesis that the pandemic is correlated with changing crime rates, including petty crime at the local level. In fact, for roughly half of the neighborhoods in Vancouver, there has not been a statistically significant difference in the proportion of theft-related crime. While we did observe some noticeable differences in select neighborhoods, there was no evidence of any kind of systematic change in 2020 relative to previous years. Additionally, there was no correlation between economic status and the neighborhoods that did observe a change.
Still, we anticipate that our findings will provide actionable insights for both city officials and the VPD. Getting budgets right is hard, but data-driven methods make the process significantly easier. Through hypothesis testing, we can keep the safety and security of Vancouverites a top priority, as weighing police funding against additional social security measures is of utmost importance.
Three examples of further questions that could extend this analysis are:
Statistics Canada, The Social and Economic Impacts of COVID-19: A Six-Month Update. Published October 20, 2020. Source.
Interpol, Operation Pangea – shining a light on pharmaceutical crime. Published November 21, 2019. Source.
ABC News, FBI warns of potential surge in hate crimes against Asian Americans amid coronavirus. Published March 27, 2020. Source.
UN News, UN chief calls for domestic violence ‘ceasefire’ amid ‘horrifying global surge’. Published April 6, 2020. Source.
CBC, City of Vancouver freezes police department funding as part of 2021 budget. Published December 8, 2020. Source.
Vancouver Police Department, Crime Data. Accessed March 5, 2021. Source.
Piazza, Group Project clarification. Published March 4, 2021. Source.*
Wright, Bradley R. Entner, Avshalom Caspi et al in Criminology, RECONSIDERING THE RELATIONSHIP BETWEEN SES AND DELINQUENCY: CAUSATION BUT NOT CORRELATION. Published March 7, 2006. Source.
City of Vancouver, Census local area profiles 2016. Published April 10, 2018. Source.
New York Times, Confidence in Police Is at Record Low, Gallup Survey Finds. Published August 12, 2020. Source.
Wickham, Hadley and Garrett Grolemund, R for Data Science. Published December 2016. Source.
Timbers, Tiffany-Anne, Trevor Campbell and Melissa Lee, Data Science: A First Introduction. Last updated January 12, 2021. Source.
Statistics Canada, Police-reported crime statistics in Canada, 2019. Published October 29, 2020. Source.
Statistics Canada, Public perceptions of the police in Canada’s provinces, 2019. Published November 25, 2020. Source.
Bazelon, Emily, New York Times Magazine. Published June 13, 2020. Source.
* Note the Piazza reference is used to ensure we're handling a proportions analysis correctly, since it was a bit unclear in the instructions.
# Delete unzipped files, and the zip archive
unlink("*.csv")
unlink("*.txt")
unlink("*.pdf")
unlink("*.zip")
In our analysis, we often reference the socioeconomic status of various neighborhoods in Vancouver. The following code cells display the full economic distribution we reference, from the 2016 City of Vancouver Census, available here.
First, we'll need to tidy the data. To do so, we'll clean up the column names using `clean_names()`, grab the survey question we're interested in from the data set, and use `gather(...)` to switch the neighborhoods from being column names to values in a `neighborhood` column.
# Read in data from the City of Vancouver "opendata" website
url <- "https://webtransfer.vancouver.ca/opendata/csv/CensusLocalAreaProfiles2016.csv"
census <- read_csv(url, skip = 4) %>%
clean_names()
head(census)
create_caption(35, "Exploring Vancouver census data")
-- Column specification ------------------------------------------------------------------------------------------------ cols( .default = col_number(), ID = col_double(), Variable = col_character() ) i Use `spec()` for the full column specifications. Warning message: "1 parsing failure. row col expected actual file 1879 ID no trailing characters 25% Data Below 'https://webtransfer.vancouver.ca/opendata/csv/CensusLocalAreaProfiles2016.csv' "
id | variable | arbutus_ridge | downtown | dunbar_southlands | fairview | grandview_woodland | hastings_sunrise | kensington_cedar_cottage | kerrisdale | ⋯ | riley_park | shaughnessy | south_cambie | strathcona | sunset | victoria_fraserview | west_end | west_point_grey | vancouver_csd | vancouver_cma |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
1 | Total - Age groups and average age of the population - 100% data | 15295 | 62030 | 21425 | 33620 | 29175 | 34575 | 49325 | 13975 | ⋯ | 22555 | 8430 | 7970 | 12585 | 36500 | 31065 | 47200 | 13065 | 631485 | 2463430 |
2 | 0 to 14 years | 2015 | 4000 | 3545 | 2580 | 3210 | 4595 | 7060 | 1880 | ⋯ | 3415 | 1175 | 1105 | 1065 | 5460 | 3790 | 1945 | 1900 | 70530 | 362110 |
3 | 0 to 4 years | 455 | 2080 | 675 | 1240 | 1320 | 1510 | 2515 | 430 | ⋯ | 1175 | 270 | 360 | 360 | 1695 | 1175 | 965 | 420 | 24615 | 116270 |
4 | 5 to 9 years | 685 | 1105 | 1225 | 760 | 1025 | 1560 | 2390 | 600 | ⋯ | 1160 | 405 | 365 | 365 | 1780 | 1210 | 560 | 670 | 22905 | 122730 |
5 | 10 to 14 years | 880 | 810 | 1650 | 580 | 865 | 1525 | 2160 | 845 | ⋯ | 1080 | 500 | 375 | 340 | 1985 | 1410 | 415 | 810 | 23010 | 123110 |
6 | 15 to 64 years | 9805 | 51275 | 14215 | 25140 | 22535 | 23945 | 35385 | 9395 | ⋯ | 15875 | 5440 | 5430 | 8745 | 25490 | 21090 | 38255 | 8660 | 463390 | 1714005 |
[1] "Figure #35: Exploring Vancouver census data"
# This is the question in the `variable` column we care about
INCOME_QUESTION <- "Average total income in 2015 among recipients ($)"
# Grab all neighborhoods' average incomes
neighbourhood_income <- census %>%
filter(variable == INCOME_QUESTION) %>%
head(1) %>% # we're only interested in the first row; the subsequent rows with this `variable` contain data for only a segment of the population (e.g., males)
gather(neighborhood, income, arbutus_ridge:vancouver_cma) %>% # convert "wide" data to long with `gather`
select(-variable, -id) # drop columns we're not interested in
head(neighbourhood_income)
create_caption(36, "Explored tidied census data")
neighborhood | income |
---|---|
<chr> | <dbl> |
arbutus_ridge | 62675 |
downtown | 63251 |
dunbar_southlands | 78117 |
fairview | 61627 |
grandview_woodland | 42896 |
hastings_sunrise | 38258 |
[1] "Figure #36: Explored tidied census data"
# We can gather some summary statistics as well
neighbourhood_income_summary <- neighbourhood_income %>%
summarize(max = max(income),
min = min(income),
mean = mean(income),
median = median(income))
neighbourhood_income_summary
create_caption(37, "A table containing summary statistics for Vancouver census data")
max | min | mean | median |
---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> |
118668 | 31534 | 54225.29 | 48785 |
[1] "Figure #37: A table containing summary statistics for Vancouver census data"
Now that we've tidied the data, we can visualize it in a histogram, to see the overall distribution, and a bar chart, to view the income by neighborhood.
# Visualize income data on a histogram
neighbourhood_income_histogram <- neighbourhood_income %>%
ggplot(aes(x = income)) +
geom_histogram(bins = 15) +
labs(
title = "Histogram for Vancouver Neighbourhood Average Incomes in 2016\n",
x = "Income ($)",
y = "Frequency"
) +
theme_stat201(12, 6)
neighbourhood_income_histogram
create_caption(38, "A histogram of wealth across Vancouver neighborhoods")
[1] "Figure #38: A histogram of wealth across Vancouver neighborhoods"
The distribution is mostly centered around the $50,000 mark, though it has no clear peak. The histogram makes clear that there is one notable outlier in the data.
# Visualize income data on a barplot
neighbourhood_income_barplot <- neighbourhood_income %>%
ggplot(aes(x = reorder(neighborhood, income), y = income)) +
geom_bar(stat = "identity") +
labs(
title = "Bar Chart for Vancouver Neighbourhood Average Incomes in 2016\n",
x = "Neighborhood",
y = "Income ($)"
) +
coord_flip() +
theme_stat201(12, 6)
neighbourhood_income_barplot
create_caption(39, "A bar plot of wealth across Vancouver neighborhoods")
[1] "Figure #39: A bar plot of wealth across Vancouver neighborhoods"
The bar chart, as a follow-up, shows that it is indeed the Shaughnessy neighborhood that is a clear outlier in terms of average income of residents in 2016. The bar chart shows there are some natural groups that can be made, such as Strathcona to Victoria/Fraserview as "very low", Hastings/Sunrise to Marpole as "low", etc., that do not necessarily fit into a prescribed dichotomy like "lower class," "middle class," and "upper class."
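One way such groupings could be operationalized (a sketch only, using the `neighbourhood_income` data frame from above with arbitrary, illustrative cut points) is with R's `cut()`:
# A sketch of bucketing average incomes into coarse groups; the break points
# are arbitrary and meant only to illustrate the idea
neighbourhood_income_grouped <- neighbourhood_income %>%
    mutate(income_group = cut(income,
                              breaks = c(-Inf, 40000, 50000, 65000, Inf),
                              labels = c("very low", "low", "middle", "high")))
head(neighbourhood_income_grouped)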
Both the histogram and the bar plot are referenced throughout the analysis to aid the interpretation of our results.