marcofanti · June 7, 2025 01:25
diff --git a/Week3.Rmd b/Week3.Rmd
 ---
 title: "COVID-19 Data Analysis - Week 3 Project"
 author: "Marco F"
 date: "`r Sys.Date()`"
 output:
  html_document:
    toc: true
    toc_float: true
    theme: flatly
    highlight: tango
    code_folding: show
  pdf_document:
    toc: true
    number_sections: true
 ---

 # Introduction

 This document analyzes COVID-19 time series data from the Johns Hopkins CSSE repository. The analysis includes both global and US-specific data, focusing on confirmed cases and deaths over time.

 # Setup and Data Loading

 ## Load Required Libraries

 ```{r libraries}
 library(readr)
 library(dplyr)
 library(tidyverse)
 library(lubridate)
 library(ggplot2)
 library(knitr)
 ```

 ## Define Data Sources

 ```{r data-sources}
 # Root folder of the JHU CSSE time-series CSVs (raw GitHub access)
 base_url <- paste0(
   "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
   "csse_covid_19_data/csse_covid_19_time_series/"
 )

 # File names inside that folder, keyed by scope and metric
 files <- as.list(c(
   global_confirmed = "time_series_covid19_confirmed_global.csv",
   global_deaths = "time_series_covid19_deaths_global.csv",
   us_confirmed = "time_series_covid19_confirmed_US.csv",
   us_deaths = "time_series_covid19_deaths_US.csv"
 ))
 ```

 ## Data Reading Function

 ```{r safe-read-function}
 # Read a CSV with read_csv(), reporting a failure instead of aborting.
 #
 # Arguments:
 #   url  Path or URL passed straight to read_csv().
 # Returns:
 #   The parsed data on success, or NULL if reading raised an error.
 safe_read_csv <- function(url) {
   tryCatch(
     {
       message("Reading: ", url)
       data <- read_csv(url, show_col_types = FALSE)
       message(
         "Successfully loaded ", nrow(data), " rows and ",
         ncol(data), " columns"
       )
       data
     },
     error = function(e) {
       # Raise a real warning rather than printing to stdout, so the failure
       # can be caught or escalated by the caller. NULL is still returned,
       # so existing is.null() checks keep working.
       warning(
         "Error reading ", url, ": ", conditionMessage(e),
         call. = FALSE
       )
       NULL
     }
   )
 }
 ```

 ## Download Data

 ```{r download-data}
 cat("Starting to download COVID-19 data from Johns Hopkins CSSE repository...\n\n")

 # Global confirmed cases
 global_cases <- safe_read_csv(paste0(base_url, files$global_confirmed))

 # Global deaths
 global_deaths <- safe_read_csv(paste0(base_url, files$global_deaths))

 # US confirmed cases
 us_cases <- safe_read_csv(paste0(base_url, files$us_confirmed))

 # US deaths
 us_deaths <- safe_read_csv(paste0(base_url, files$us_deaths))

 # safe_read_csv() returns NULL when a read fails. Stop here with a clear
 # message instead of letting the cleaning chunks fail later on a NULL input.
 download_failed <- c(
   global_cases = is.null(global_cases),
   global_deaths = is.null(global_deaths),
   us_cases = is.null(us_cases),
   us_deaths = is.null(us_deaths)
 )
 if (any(download_failed)) {
   stop(
     "Failed to download: ",
     paste(names(download_failed)[download_failed], collapse = ", "),
     call. = FALSE
   )
 }
 ```

 # Data Cleaning and Transformation

 ## Global Data Processing

 ### Clean Global Cases Data

 ```{r clean-global-cases}
 # Reshape the wide global confirmed-cases table into one row per
 # location and date.
 global_cases <- global_cases %>%
   # Coordinates are not used further
   select(-Lat, -Long) %>%
   # Replace the slash-containing headers with syntactic names
   rename(
     province_state = "Province/State",
     country_region = "Country/Region"
   ) %>%
   # Every remaining column becomes a (date, cases) pair; counts are
   # coerced to numeric
   pivot_longer(
     cols = -c(province_state, country_region),
     names_to = "date",
     values_to = "cases",
     values_transform = list(cases = as.numeric)
   ) %>%
   # Keep only rows that carry a count
   drop_na(cases)
 ```

 ### Clean Global Deaths Data

 ```{r clean-global-deaths}
 # Reshape the wide global deaths table into one row per location and date.
 global_deaths <- global_deaths %>%
   # Coordinates are not used further
   select(-Lat, -Long) %>%
   # Replace the slash-containing headers with syntactic names
   rename(
     province_state = "Province/State",
     country_region = "Country/Region"
   ) %>%
   # Every remaining column becomes a (date, deaths) pair; counts are
   # coerced to numeric
   pivot_longer(
     cols = -c(province_state, country_region),
     names_to = "date",
     values_to = "deaths",
     values_transform = list(deaths = as.numeric)
   ) %>%
   # Keep only rows that carry a count
   drop_na(deaths)
 ```

 ### Combine Global Data

 ```{r combine-global}
 # Merge case and death counts on location and date, parse the date strings
 # with mdy(), and keep only rows with at least one case.
 global <- full_join(
   global_cases,
   global_deaths,
   by = c("province_state", "country_region", "date")
 ) %>%
   mutate(date = mdy(date)) %>%
   filter(cases > 0)

 # Print a per-column overview of the combined table
 cat("Global data summary:\n")
 summary(global)
 ```

 ## US Data Processing

 ### Clean US Cases Data

 ```{r clean-us-cases}
 # Reshape the US confirmed-cases table: every column outside the
 # UID..Combined_Key range becomes a (date, cases) row.
 us_cases <- us_cases %>%
   pivot_longer(
     cols = !(UID:Combined_Key),
     names_to = "date",
     values_to = "cases"
   ) %>%
   # Keep Admin2 through cases, minus the coordinates
   select(Admin2:cases, -Lat, -Long_) %>%
   mutate(date = mdy(date))
 ```

 ### Clean US Deaths Data

 ```{r clean-us-deaths}
 # Reshape the US deaths table: every column outside the UID..Population
 # range becomes a (date, deaths) row.
 us_deaths <- us_deaths %>%
   pivot_longer(
     cols = !(UID:Population),
     names_to = "date",
     values_to = "deaths"
   ) %>%
   # Keep Admin2 through deaths, minus the coordinates
   select(Admin2:deaths, -Lat, -Long_) %>%
   mutate(date = mdy(date))
 ```

 ### Combine US Data

 ```{r combine-us}
 # Merge the US case and death tables and keep rows with at least one case.
 # The join keys are spelled out so the join does not depend on (or print a
 # "Joining with `by = ...`" message about) whichever columns the two
 # tables happen to share.
 # NOTE(review): the key list assumes both cleaned tables carry exactly these
 # shared columns (standard JHU layout) - confirm against the downloaded CSVs.
 us <- us_cases %>%
   full_join(
     us_deaths,
     by = c(
       "Admin2", "Province_State", "Country_Region",
       "Combined_Key", "date"
     )
   ) %>%
   filter(cases > 0)

 cat("US data summary:\n")
 summary(us)
 ```

 ## Enhance Global Data with Population Information

 ```{r enhance-global}
 # Build a "province, country" Combined_Key (NA parts are skipped) and
 # switch the location columns to the capitalised names used by the lookup.
 global <- global %>%
   unite("Combined_Key",
     c(province_state, country_region),
     sep = ", ",
     na.rm = TRUE,
     remove = FALSE
   ) %>%
   # Rename by name instead of overwriting colnames() positionally, so a
   # change in column order can never silently mislabel a column.
   rename(
     Province_State = province_state,
     Country_Region = country_region
   )

 # Load population lookup table
 uid_lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
 uid_lookup <- safe_read_csv(uid_lookup_url)

 # safe_read_csv() returns NULL on failure; fail with a clear message here
 # rather than with an obscure error inside select().
 if (is.null(uid_lookup)) {
   stop("Failed to download the UID lookup table", call. = FALSE)
 }

 # Drop lookup columns that are not needed for the join
 uid_lookup <- uid_lookup %>%
   select(-c(Lat, Long_, Combined_Key, code3, iso2, iso3, Admin2))

 # Attach Population. The final select() keeps only the analysis columns,
 # which also discards the lookup's UID and FIPS.
 global <- global %>%
   left_join(uid_lookup, by = c("Province_State", "Country_Region")) %>%
   select(Province_State, Country_Region, date, cases, deaths, Population, Combined_Key)
 ```

 # Data Aggregation

 ## US State-Level Data

 ```{r us-by-state}
 # Sum the rows of `us` to one row per state and date, then add deaths per
 # million of the summed population.
 US_by_state <- us %>%
   group_by(Province_State, Country_Region, date) %>%
   summarise(
     across(c(cases, deaths, Population), sum),
     .groups = "drop"
   ) %>%
   mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
   # Place the rate just before Population
   relocate(deaths_per_mill, .before = Population)
 ```

 ## US National Totals

 ```{r us-totals}
 # Sum the state rows to one national row per date, then add deaths per
 # million of the summed population.
 US_totals <- US_by_state %>%
   group_by(Country_Region, date) %>%
   summarise(
     across(c(cases, deaths, Population), sum),
     .groups = "drop"
   ) %>%
   mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
   # Place the rate just before Population
   relocate(deaths_per_mill, .before = Population)

 # Show the first six rows as a formatted table
 kable(head(US_totals), caption = "US COVID-19 Totals (First 6 rows)")
 ```

 # Data Visualization

 ## US COVID-19 Cases and Deaths Over Time

 ```{r us-covid-plot, fig.cap="COVID-19 Cases and Deaths in the United States (Log Scale)"}
 # Plot cumulative US cases and deaths as two colored series on a shared
 # log10 y axis; dates without any recorded case are dropped first.
 US_totals %>%
   filter(cases > 0) %>%
   ggplot(aes(x = date)) +
   # Line thickness is set with `linewidth`: using `size` for line geoms is
   # deprecated since ggplot2 3.4.0 and prints a warning into the report.
   geom_line(aes(y = cases, color = "Cases"), linewidth = 1) +
   geom_point(aes(y = cases, color = "Cases"), alpha = 0.7) +
   geom_line(aes(y = deaths, color = "Deaths"), linewidth = 1) +
   geom_point(aes(y = deaths, color = "Deaths"), alpha = 0.7) +
   scale_y_log10(labels = scales::comma_format()) +
   scale_color_manual(
     values = c("Cases" = "#2E86AB", "Deaths" = "#A23B72"),
     name = "Metric"
   ) +
   theme_minimal() +
   theme(
     legend.position = "bottom",
     axis.text.x = element_text(angle = 45, hjust = 1),
     plot.title = element_text(size = 14, face = "bold"),
     plot.subtitle = element_text(size = 12, color = "gray60")
   ) +
   labs(
     title = "COVID-19 Cases and Deaths in the United States",
     subtitle = "Cumulative totals on logarithmic scale",
     x = "Date",
     y = "Count (Log Scale)",
     caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
   )
 ```

 ```{r ny-covid-plot, fig.cap="COVID-19 Cases and Deaths in NY state (Log Scale)"}
 # Plot cumulative cases and deaths for one state on a log10 y axis.
 state <- "New York"
 US_by_state %>%
   filter(Province_State == state) %>%
   filter(cases > 0) %>%
   ggplot(aes(x = date, y = cases)) +
   # Line thickness is set with `linewidth`: using `size` for line geoms is
   # deprecated since ggplot2 3.4.0 and prints a warning into the report.
   geom_line(aes(y = cases, color = "Cases"), linewidth = 1) +
   geom_point(aes(color = "Cases"), alpha = 0.7) +
   geom_line(aes(y = deaths, color = "Deaths"), linewidth = 1) +
   geom_point(aes(y = deaths, color = "Deaths"), alpha = 0.7) +
   scale_y_log10() +
   theme(
     legend.position = "bottom",
     axis.text.x = element_text(angle = 90, hjust = 1)
   ) +
   labs(title = str_c("COVID-19 Cases and Deaths in ", state), y = NULL)

 ```

 Before computing daily changes, let's check the date range and the cumulative totals covered by the data:

 ```{r data-summary}
 # Report the most recent date and the largest cumulative death count
 cat("Latest date in dataset:", format(max(US_totals[["date"]])), "\n")
 cat(
   "Maximum total deaths:",
   format(max(US_totals[["deaths"]]), big.mark = ","),
   "\n"
 )
 ```

 ## Calculate New Daily Cases and Deaths

 We'll calculate the daily new cases and deaths by taking the difference from the previous day:

 ```{r calculate-new-cases}
 # Per-state daily increments: difference of each cumulative series within
 # a state, with the first row compared against zero.
 US_by_state <- US_by_state %>%
   arrange(Province_State, date) %>%
   group_by(Province_State) %>%
   mutate(
     new_cases = diff(c(0, cases)),
     new_deaths = diff(c(0, deaths))
   ) %>%
   ungroup()

 # National daily increments, computed the same way over the date-ordered
 # totals.
 US_totals <- US_totals %>%
   arrange(date) %>%
   mutate(
     new_cases = diff(c(0, cases)),
     new_deaths = diff(c(0, deaths))
   )
 ```

 ## National Trends: Daily New Cases and Deaths

 ```{r national-trends, fig.cap="Daily new COVID-19 cases and deaths in the United States"}
 # Plot national daily new cases and deaths from 2020-03-01 onward on a
 # shared log10 y axis.
 US_totals %>%
   filter(date >= as.Date("2020-03-01")) %>%  # Start from March 2020
   ggplot(aes(x = date)) +
   # Line thickness is set with `linewidth`: using `size` for line geoms is
   # deprecated since ggplot2 3.4.0 and prints a warning into the report.
   geom_line(aes(y = new_cases, color = "New Cases"), linewidth = 1) +
   geom_point(aes(y = new_cases, color = "New Cases"), alpha = 0.6) +
   geom_line(aes(y = new_deaths, color = "New Deaths"), linewidth = 1) +
   geom_point(aes(y = new_deaths, color = "New Deaths"), alpha = 0.6) +
   scale_y_log10(labels = scales::comma_format()) +
   scale_color_manual(
     values = c("New Cases" = "#2E86AB", "New Deaths" = "#A23B72"),
     name = "Metric"
   ) +
   theme_minimal() +
   theme(
     legend.position = "bottom",
     axis.text.x = element_text(angle = 45, hjust = 1),
     plot.title = element_text(size = 14, face = "bold")
   ) +
   labs(
     title = "COVID-19 Daily New Cases and Deaths in the US",
     subtitle = "Logarithmic scale showing pandemic waves",
     x = "Date",
     y = "Count (Log Scale)",
     caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
   )
 ```

 ## State-Level Analysis: New York

 Let's examine the trends for a specific state. We'll use New York as an example since it was heavily impacted early in the pandemic:

 ```{r ny-state-analysis, fig.cap="Daily new COVID-19 cases and deaths in New York State"}
 # Plot daily new cases and deaths for one state from 2020-03-01 onward on
 # a shared log10 y axis.
 state <- "New York"

 US_by_state %>%
   filter(
     Province_State == state,
     date >= as.Date("2020-03-01"),
     new_cases >= 0  # Filter out negative values from data corrections
   ) %>%
   ggplot(aes(x = date)) +
   # Line thickness is set with `linewidth`: using `size` for line geoms is
   # deprecated since ggplot2 3.4.0 and prints a warning into the report.
   geom_line(aes(y = new_cases, color = "New Cases"), linewidth = 1) +
   geom_point(aes(y = new_cases, color = "New Cases"), alpha = 0.6) +
   geom_line(aes(y = new_deaths, color = "New Deaths"), linewidth = 1) +
   geom_point(aes(y = new_deaths, color = "New Deaths"), alpha = 0.6) +
   scale_y_log10(labels = scales::comma_format()) +
   scale_color_manual(
     values = c("New Cases" = "#2E86AB", "New Deaths" = "#A23B72"),
     name = "Metric"
   ) +
   theme_minimal() +
   theme(
     legend.position = "bottom",
     axis.text.x = element_text(angle = 45, hjust = 1),
     plot.title = element_text(size = 14, face = "bold")
   ) +
   labs(
     title = str_c("COVID-19 Daily New Cases and Deaths in ", state),
     subtitle = "Logarithmic scale showing state-level pandemic trends",
     x = "Date",
     y = "Count (Log Scale)",
     caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
   )
 ```

 ## State Totals and Per-Capita Analysis

 Now let's create a summary table with total cases and deaths by state, along with per-capita metrics:

 ```{r state-totals}
 # Collapse each state's series to its maximum cases, deaths and
 # population, derive per-thousand rates, and drop states without cases
 # or population.
 US_state_totals <- US_by_state %>%
   group_by(Province_State) %>%
   summarise(
     deaths = max(deaths, na.rm = TRUE),
     cases = max(cases, na.rm = TRUE),
     population = max(Population, na.rm = TRUE),
     .groups = "drop"
   ) %>%
   mutate(
     cases_per_thou = 1000 * cases / population,
     deaths_per_thou = 1000 * deaths / population
   ) %>%
   filter(cases > 0 & population > 0)

 # Print headline counts for the state table
 cat("State-level summary statistics:\n")
 cat("Number of states/territories:", nrow(US_state_totals), "\n")
 cat("Total US cases:", format(sum(US_state_totals$cases), big.mark = ","), "\n")
 cat("Total US deaths:", format(sum(US_state_totals$deaths), big.mark = ","), "\n")
 ```

 # State-Level Comparative Analysis

 ## Top 10 States by Cases and Deaths per Thousand

 ```{r top-states-analysis}
 # Ten states with the highest case rate
 top_cases <- US_state_totals %>%
   select(Province_State, cases_per_thou, deaths_per_thou, population) %>%
   arrange(-cases_per_thou) %>%
   slice_head(n = 10)

 # Ten states with the highest death rate
 top_deaths <- US_state_totals %>%
   select(Province_State, cases_per_thou, deaths_per_thou, population) %>%
   arrange(-deaths_per_thou) %>%
   slice_head(n = 10)

 cat("Top 10 States by Cases per Thousand:\n")
 print(top_cases)

 cat("\nTop 10 States by Deaths per Thousand:\n")
 print(top_deaths)
 ```

 ## Statistical Summary and Key Insights

 ```{r statistical-summary}
 # National figures: the single row for the latest date in US_totals
 national_stats <- US_totals %>%
   filter(date == max(date)) %>%
   slice_head(n = 1)

 # Mean and median of the per-thousand rates across the state table
 state_stats <- US_state_totals %>%
   summarise(
     total_states = n(),
     across(
       c(cases_per_thou, deaths_per_thou),
       list(avg = mean, median = median),
       .names = "{.fn}_{.col}"
     )
   )

 cat("=== NATIONAL SUMMARY ===\n")
 cat("Total Cases:", format(national_stats$cases, big.mark = ","), "\n")
 cat("Total Deaths:", format(national_stats$deaths, big.mark = ","), "\n")
 cat(
   "Overall CFR:",
   round(national_stats$deaths / national_stats$cases * 100, 2),
   "%\n"
 )
 cat("Deaths per Million:", round(national_stats$deaths_per_mill, 2), "\n\n")

 cat("=== STATE-LEVEL SUMMARY ===\n")
 cat("Number of States/Territories:", state_stats$total_states, "\n")
 cat("Average Cases per Thousand:", round(state_stats$avg_cases_per_thou, 2), "\n")
 cat("Median Cases per Thousand:", round(state_stats$median_cases_per_thou, 2), "\n")
 cat("Average Deaths per Thousand:", round(state_stats$avg_deaths_per_thou, 2), "\n")
 cat("Median Deaths per Thousand:", round(state_stats$median_deaths_per_thou, 2), "\n")


 ```

 ## Summary of Findings

 This analysis of COVID-19 data from the Johns Hopkins CSSE repository reveals significant insights about the pandemic's impact across the United States. The examination of both national trends and state-level variations demonstrates the heterogeneous nature of pandemic spread across different populations and geographies. The United States experienced multiple distinct waves of COVID-19 infections, with clear peaks and valleys in both case and death counts effectively captured through logarithmic scale visualizations. Significant variation exists across states in terms of cases and deaths per thousand population, likely reflecting differences in population density, demographics, healthcare infrastructure, and policy responses. A comparison of cases per capita and deaths per capita across states (based on the per-thousand tables above; no formal correlation is computed in this report) suggests that while some variation exists in case fatality rates, the fundamental relationship between infection spread and mortality remains consistent at the population level.

 ## Limitations and Sources of Bias

 The primary limitation of this analysis stems from temporal bias in interpretation. Analyzing COVID-19 data in 2025 with full knowledge of how the pandemic evolved creates an inherent bias in interpreting early trends through the lens of later developments. This retrospective knowledge may lead to over-interpretation of patterns that seemed less significant at the time, or conversely, may cause us to undervalue the uncertainty and fear that characterized early pandemic decision-making. Additionally, the data itself contains reporting delays, weekend effects, and evolving case definitions that affect the reliability of day-to-day comparisons.

 ## Personal Bias and Mitigation

 As an analyst examining this data with the benefit of hindsight, I carry temporal bias that influences my interpretation of early pandemic trends. Knowing the eventual trajectory of cases, deaths, and policy responses, I may unconsciously interpret initial data patterns as more predictable or logical than they actually were at the time. To mitigate this bias, I have focused on transparent methodology by clearly documenting all data processing steps and analytical choices, avoided making causal claims, and emphasized descriptive patterns rather than explanatory theories that might be influenced by hindsight knowledge.

 ```{r session-info, include=FALSE}
 # Record R version and loaded packages for reproducibility
 utils::sessionInfo()
 ```
	---
	title: "COVID-19 Data Analysis - Week 3 Project"
	author: "Marco F"
	date: "`r Sys.Date()`"
	output:
	  html_document:
	    toc: true
	    toc_float: true
	    theme: flatly
	    highlight: tango
	    code_folding: show
	  pdf_document:
	    toc: true
	    number_sections: true
	---

	# Introduction

	This document analyzes COVID-19 time series data from the Johns Hopkins CSSE repository. The analysis includes both global and US-specific data, focusing on confirmed cases and deaths over time.

	# Setup and Data Loading

	## Load Required Libraries

	```{r libraries}
	library(readr)
	library(dplyr)
	library(tidyverse)
	library(lubridate)
	library(ggplot2)
	library(knitr)
	```

	## Define Data Sources

	```{r data-sources}
	# Root folder of the JHU CSSE time-series CSVs (raw GitHub access)
	base_url <- paste0(
	  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
	  "csse_covid_19_data/csse_covid_19_time_series/"
	)

	# File names inside that folder, keyed by scope and metric
	files <- as.list(c(
	  global_confirmed = "time_series_covid19_confirmed_global.csv",
	  global_deaths = "time_series_covid19_deaths_global.csv",
	  us_confirmed = "time_series_covid19_confirmed_US.csv",
	  us_deaths = "time_series_covid19_deaths_US.csv"
	))
	```

	## Data Reading Function

	```{r safe-read-function}
	# Read a CSV with read_csv(), reporting a failure instead of aborting.
	#
	# Arguments:
	#   url  Path or URL passed straight to read_csv().
	# Returns:
	#   The parsed data on success, or NULL if reading raised an error.
	safe_read_csv <- function(url) {
	  tryCatch(
	    {
	      message("Reading: ", url)
	      data <- read_csv(url, show_col_types = FALSE)
	      message(
	        "Successfully loaded ", nrow(data), " rows and ",
	        ncol(data), " columns"
	      )
	      data
	    },
	    error = function(e) {
	      # Raise a real warning rather than printing to stdout, so the failure
	      # can be caught or escalated by the caller. NULL is still returned,
	      # so existing is.null() checks keep working.
	      warning(
	        "Error reading ", url, ": ", conditionMessage(e),
	        call. = FALSE
	      )
	      NULL
	    }
	  )
	}
	```

	## Download Data

	```{r download-data}
	cat("Starting to download COVID-19 data from Johns Hopkins CSSE repository...\n\n")

	# Global confirmed cases
	global_cases <- safe_read_csv(paste0(base_url, files$global_confirmed))

	# Global deaths
	global_deaths <- safe_read_csv(paste0(base_url, files$global_deaths))

	# US confirmed cases
	us_cases <- safe_read_csv(paste0(base_url, files$us_confirmed))

	# US deaths
	us_deaths <- safe_read_csv(paste0(base_url, files$us_deaths))

	# safe_read_csv() returns NULL when a read fails. Stop here with a clear
	# message instead of letting the cleaning chunks fail later on a NULL input.
	download_failed <- c(
	  global_cases = is.null(global_cases),
	  global_deaths = is.null(global_deaths),
	  us_cases = is.null(us_cases),
	  us_deaths = is.null(us_deaths)
	)
	if (any(download_failed)) {
	  stop(
	    "Failed to download: ",
	    paste(names(download_failed)[download_failed], collapse = ", "),
	    call. = FALSE
	  )
	}
	```

	# Data Cleaning and Transformation

	## Global Data Processing

	### Clean Global Cases Data

	```{r clean-global-cases}
	# Reshape the wide global confirmed-cases table into one row per
	# location and date.
	global_cases <- global_cases %>%
	  # Coordinates are not used further
	  select(-Lat, -Long) %>%
	  # Replace the slash-containing headers with syntactic names
	  rename(
	    province_state = "Province/State",
	    country_region = "Country/Region"
	  ) %>%
	  # Every remaining column becomes a (date, cases) pair; counts are
	  # coerced to numeric
	  pivot_longer(
	    cols = -c(province_state, country_region),
	    names_to = "date",
	    values_to = "cases",
	    values_transform = list(cases = as.numeric)
	  ) %>%
	  # Keep only rows that carry a count
	  drop_na(cases)
	```

	### Clean Global Deaths Data

	```{r clean-global-deaths}
	# Reshape the wide global deaths table into one row per location and date.
	global_deaths <- global_deaths %>%
	  # Coordinates are not used further
	  select(-Lat, -Long) %>%
	  # Replace the slash-containing headers with syntactic names
	  rename(
	    province_state = "Province/State",
	    country_region = "Country/Region"
	  ) %>%
	  # Every remaining column becomes a (date, deaths) pair; counts are
	  # coerced to numeric
	  pivot_longer(
	    cols = -c(province_state, country_region),
	    names_to = "date",
	    values_to = "deaths",
	    values_transform = list(deaths = as.numeric)
	  ) %>%
	  # Keep only rows that carry a count
	  drop_na(deaths)
	```

	### Combine Global Data

	```{r combine-global}
	# Merge case and death counts on location and date, parse the date strings
	# with mdy(), and keep only rows with at least one case.
	global <- full_join(
	  global_cases,
	  global_deaths,
	  by = c("province_state", "country_region", "date")
	) %>%
	  mutate(date = mdy(date)) %>%
	  filter(cases > 0)

	# Print a per-column overview of the combined table
	cat("Global data summary:\n")
	summary(global)
	```

	## US Data Processing

	### Clean US Cases Data

	```{r clean-us-cases}
	# Reshape the US confirmed-cases table: every column outside the
	# UID..Combined_Key range becomes a (date, cases) row.
	us_cases <- us_cases %>%
	  pivot_longer(
	    cols = !(UID:Combined_Key),
	    names_to = "date",
	    values_to = "cases"
	  ) %>%
	  # Keep Admin2 through cases, minus the coordinates
	  select(Admin2:cases, -Lat, -Long_) %>%
	  mutate(date = mdy(date))
	```

	### Clean US Deaths Data

	```{r clean-us-deaths}
	# Reshape the US deaths table: every column outside the UID..Population
	# range becomes a (date, deaths) row.
	us_deaths <- us_deaths %>%
	  pivot_longer(
	    cols = !(UID:Population),
	    names_to = "date",
	    values_to = "deaths"
	  ) %>%
	  # Keep Admin2 through deaths, minus the coordinates
	  select(Admin2:deaths, -Lat, -Long_) %>%
	  mutate(date = mdy(date))
	```

	### Combine US Data

	```{r combine-us}
	# Merge the US case and death tables and keep rows with at least one case.
	# The join keys are spelled out so the join does not depend on (or print a
	# "Joining with `by = ...`" message about) whichever columns the two
	# tables happen to share.
	# NOTE(review): the key list assumes both cleaned tables carry exactly these
	# shared columns (standard JHU layout) - confirm against the downloaded CSVs.
	us <- us_cases %>%
	  full_join(
	    us_deaths,
	    by = c(
	      "Admin2", "Province_State", "Country_Region",
	      "Combined_Key", "date"
	    )
	  ) %>%
	  filter(cases > 0)

	cat("US data summary:\n")
	summary(us)
	```

	## Enhance Global Data with Population Information

	```{r enhance-global}
	# Build a "province, country" Combined_Key (NA parts are skipped) and
	# switch the location columns to the capitalised names used by the lookup.
	global <- global %>%
	  unite("Combined_Key",
	    c(province_state, country_region),
	    sep = ", ",
	    na.rm = TRUE,
	    remove = FALSE
	  ) %>%
	  # Rename by name instead of overwriting colnames() positionally, so a
	  # change in column order can never silently mislabel a column.
	  rename(
	    Province_State = province_state,
	    Country_Region = country_region
	  )

	# Load population lookup table
	uid_lookup_url <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/refs/heads/master/csse_covid_19_data/UID_ISO_FIPS_LookUp_Table.csv"
	uid_lookup <- safe_read_csv(uid_lookup_url)

	# safe_read_csv() returns NULL on failure; fail with a clear message here
	# rather than with an obscure error inside select().
	if (is.null(uid_lookup)) {
	  stop("Failed to download the UID lookup table", call. = FALSE)
	}

	# Drop lookup columns that are not needed for the join
	uid_lookup <- uid_lookup %>%
	  select(-c(Lat, Long_, Combined_Key, code3, iso2, iso3, Admin2))

	# Attach Population. The final select() keeps only the analysis columns,
	# which also discards the lookup's UID and FIPS.
	global <- global %>%
	  left_join(uid_lookup, by = c("Province_State", "Country_Region")) %>%
	  select(Province_State, Country_Region, date, cases, deaths, Population, Combined_Key)
	```

	# Data Aggregation

	## US State-Level Data

	```{r us-by-state}
	# Sum the rows of `us` to one row per state and date, then add deaths per
	# million of the summed population.
	US_by_state <- us %>%
	  group_by(Province_State, Country_Region, date) %>%
	  summarise(
	    across(c(cases, deaths, Population), sum),
	    .groups = "drop"
	  ) %>%
	  mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
	  # Place the rate just before Population
	  relocate(deaths_per_mill, .before = Population)
	```

	## US National Totals

	```{r us-totals}
	# Sum the state rows to one national row per date, then add deaths per
	# million of the summed population.
	US_totals <- US_by_state %>%
	  group_by(Country_Region, date) %>%
	  summarise(
	    across(c(cases, deaths, Population), sum),
	    .groups = "drop"
	  ) %>%
	  mutate(deaths_per_mill = deaths * 1e6 / Population) %>%
	  # Place the rate just before Population
	  relocate(deaths_per_mill, .before = Population)

	# Show the first six rows as a formatted table
	kable(head(US_totals), caption = "US COVID-19 Totals (First 6 rows)")
	```

	# Data Visualization

	## US COVID-19 Cases and Deaths Over Time

	```{r us-covid-plot, fig.cap="COVID-19 Cases and Deaths in the United States (Log Scale)"}
	# Plot cumulative US cases and deaths as two colored series on a shared
	# log10 y axis; dates without any recorded case are dropped first.
	US_totals %>%
	  filter(cases > 0) %>%
	  ggplot(aes(x = date)) +
	  # Line thickness is set with `linewidth`: using `size` for line geoms is
	  # deprecated since ggplot2 3.4.0 and prints a warning into the report.
	  geom_line(aes(y = cases, color = "Cases"), linewidth = 1) +
	  geom_point(aes(y = cases, color = "Cases"), alpha = 0.7) +
	  geom_line(aes(y = deaths, color = "Deaths"), linewidth = 1) +
	  geom_point(aes(y = deaths, color = "Deaths"), alpha = 0.7) +
	  scale_y_log10(labels = scales::comma_format()) +
	  scale_color_manual(
	    values = c("Cases" = "#2E86AB", "Deaths" = "#A23B72"),
	    name = "Metric"
	  ) +
	  theme_minimal() +
	  theme(
	    legend.position = "bottom",
	    axis.text.x = element_text(angle = 45, hjust = 1),
	    plot.title = element_text(size = 14, face = "bold"),
	    plot.subtitle = element_text(size = 12, color = "gray60")
	  ) +
	  labs(
	    title = "COVID-19 Cases and Deaths in the United States",
	    subtitle = "Cumulative totals on logarithmic scale",
	    x = "Date",
	    y = "Count (Log Scale)",
	    caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
	  )
	```

	```{r ny-covid-plot, fig.cap="COVID-19 Cases and Deaths in NY state (Log Scale)"}
	# Plot cumulative cases and deaths for one state on a log10 y axis.
	state <- "New York"
	US_by_state %>%
	  filter(Province_State == state) %>%
	  filter(cases > 0) %>%
	  ggplot(aes(x = date, y = cases)) +
	  # Line thickness is set with `linewidth`: using `size` for line geoms is
	  # deprecated since ggplot2 3.4.0 and prints a warning into the report.
	  geom_line(aes(y = cases, color = "Cases"), linewidth = 1) +
	  geom_point(aes(color = "Cases"), alpha = 0.7) +
	  geom_line(aes(y = deaths, color = "Deaths"), linewidth = 1) +
	  geom_point(aes(y = deaths, color = "Deaths"), alpha = 0.7) +
	  scale_y_log10() +
	  theme(
	    legend.position = "bottom",
	    axis.text.x = element_text(angle = 90, hjust = 1)
	  ) +
	  labs(title = str_c("COVID-19 Cases and Deaths in ", state), y = NULL)

	```

	Before computing daily changes, let's check the date range and the cumulative totals covered by the data:

	```{r data-summary}
	# Report the most recent date and the largest cumulative death count
	cat("Latest date in dataset:", format(max(US_totals[["date"]])), "\n")
	cat(
	  "Maximum total deaths:",
	  format(max(US_totals[["deaths"]]), big.mark = ","),
	  "\n"
	)
	```

	## Calculate New Daily Cases and Deaths

	We'll calculate the daily new cases and deaths by taking the difference from the previous day:

	```{r calculate-new-cases}
	# Per-state daily increments: difference of each cumulative series within
	# a state, with the first row compared against zero.
	US_by_state <- US_by_state %>%
	  arrange(Province_State, date) %>%
	  group_by(Province_State) %>%
	  mutate(
	    new_cases = diff(c(0, cases)),
	    new_deaths = diff(c(0, deaths))
	  ) %>%
	  ungroup()

	# National daily increments, computed the same way over the date-ordered
	# totals.
	US_totals <- US_totals %>%
	  arrange(date) %>%
	  mutate(
	    new_cases = diff(c(0, cases)),
	    new_deaths = diff(c(0, deaths))
	  )
	```

	## National Trends: Daily New Cases and Deaths

	```{r national-trends, fig.cap="Daily new COVID-19 cases and deaths in the United States"}
	# Plot national daily new cases and deaths from 2020-03-01 onward on a
	# shared log10 y axis.
	US_totals %>%
	  filter(date >= as.Date("2020-03-01")) %>% # Start from March 2020
	  ggplot(aes(x = date)) +
	  # Line thickness is set with `linewidth`: using `size` for line geoms is
	  # deprecated since ggplot2 3.4.0 and prints a warning into the report.
	  geom_line(aes(y = new_cases, color = "New Cases"), linewidth = 1) +
	  geom_point(aes(y = new_cases, color = "New Cases"), alpha = 0.6) +
	  geom_line(aes(y = new_deaths, color = "New Deaths"), linewidth = 1) +
	  geom_point(aes(y = new_deaths, color = "New Deaths"), alpha = 0.6) +
	  scale_y_log10(labels = scales::comma_format()) +
	  scale_color_manual(
	    values = c("New Cases" = "#2E86AB", "New Deaths" = "#A23B72"),
	    name = "Metric"
	  ) +
	  theme_minimal() +
	  theme(
	    legend.position = "bottom",
	    axis.text.x = element_text(angle = 45, hjust = 1),
	    plot.title = element_text(size = 14, face = "bold")
	  ) +
	  labs(
	    title = "COVID-19 Daily New Cases and Deaths in the US",
	    subtitle = "Logarithmic scale showing pandemic waves",
	    x = "Date",
	    y = "Count (Log Scale)",
	    caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
	  )
	```

	## State-Level Analysis: New York

	Let's examine the trends for a specific state. We'll use New York as an example since it was heavily impacted early in the pandemic:

	```{r ny-state-analysis, fig.cap="Daily new COVID-19 cases and deaths in New York State"}
	# Plot daily new cases and deaths for one state from 2020-03-01 onward on
	# a shared log10 y axis.
	state <- "New York"

	US_by_state %>%
	  filter(
	    Province_State == state,
	    date >= as.Date("2020-03-01"),
	    new_cases >= 0 # Filter out negative values from data corrections
	  ) %>%
	  ggplot(aes(x = date)) +
	  # Line thickness is set with `linewidth`: using `size` for line geoms is
	  # deprecated since ggplot2 3.4.0 and prints a warning into the report.
	  geom_line(aes(y = new_cases, color = "New Cases"), linewidth = 1) +
	  geom_point(aes(y = new_cases, color = "New Cases"), alpha = 0.6) +
	  geom_line(aes(y = new_deaths, color = "New Deaths"), linewidth = 1) +
	  geom_point(aes(y = new_deaths, color = "New Deaths"), alpha = 0.6) +
	  scale_y_log10(labels = scales::comma_format()) +
	  scale_color_manual(
	    values = c("New Cases" = "#2E86AB", "New Deaths" = "#A23B72"),
	    name = "Metric"
	  ) +
	  theme_minimal() +
	  theme(
	    legend.position = "bottom",
	    axis.text.x = element_text(angle = 45, hjust = 1),
	    plot.title = element_text(size = 14, face = "bold")
	  ) +
	  labs(
	    title = str_c("COVID-19 Daily New Cases and Deaths in ", state),
	    subtitle = "Logarithmic scale showing state-level pandemic trends",
	    x = "Date",
	    y = "Count (Log Scale)",
	    caption = "Data source: Johns Hopkins CSSE COVID-19 Data Repository"
	  )
	```

	## State Totals and Per-Capita Analysis

	Now let's create a summary table with total cases and deaths by state, along with per-capita metrics:

	```{r state-totals}
	# Collapse each state's series to its maximum cases, deaths and
	# population, derive per-thousand rates, and drop states without cases
	# or population.
	US_state_totals <- US_by_state %>%
	  group_by(Province_State) %>%
	  summarise(
	    deaths = max(deaths, na.rm = TRUE),
	    cases = max(cases, na.rm = TRUE),
	    population = max(Population, na.rm = TRUE),
	    .groups = "drop"
	  ) %>%
	  mutate(
	    cases_per_thou = 1000 * cases / population,
	    deaths_per_thou = 1000 * deaths / population
	  ) %>%
	  filter(cases > 0 & population > 0)

	# Print headline counts for the state table
	cat("State-level summary statistics:\n")
	cat("Number of states/territories:", nrow(US_state_totals), "\n")
	cat("Total US cases:", format(sum(US_state_totals$cases), big.mark = ","), "\n")
	cat("Total US deaths:", format(sum(US_state_totals$deaths), big.mark = ","), "\n")
	```

	# State-Level Comparative Analysis

	## Top 10 States by Cases and Deaths per Thousand

	```{r top-states-analysis}
	# Ten states with the highest case rate
	top_cases <- US_state_totals %>%
	  select(Province_State, cases_per_thou, deaths_per_thou, population) %>%
	  arrange(-cases_per_thou) %>%
	  slice_head(n = 10)

	# Ten states with the highest death rate
	top_deaths <- US_state_totals %>%
	  select(Province_State, cases_per_thou, deaths_per_thou, population) %>%
	  arrange(-deaths_per_thou) %>%
	  slice_head(n = 10)

	cat("Top 10 States by Cases per Thousand:\n")
	print(top_cases)

	cat("\nTop 10 States by Deaths per Thousand:\n")
	print(top_deaths)
	```

	## Statistical Summary and Key Insights

	```{r statistical-summary}
	# National figures: the single row for the latest date in US_totals
	national_stats <- US_totals %>%
	  filter(date == max(date)) %>%
	  slice_head(n = 1)

	# Mean and median of the per-thousand rates across the state table
	state_stats <- US_state_totals %>%
	  summarise(
	    total_states = n(),
	    across(
	      c(cases_per_thou, deaths_per_thou),
	      list(avg = mean, median = median),
	      .names = "{.fn}_{.col}"
	    )
	  )

	cat("=== NATIONAL SUMMARY ===\n")
	cat("Total Cases:", format(national_stats$cases, big.mark = ","), "\n")
	cat("Total Deaths:", format(national_stats$deaths, big.mark = ","), "\n")
	cat(
	  "Overall CFR:",
	  round(national_stats$deaths / national_stats$cases * 100, 2),
	  "%\n"
	)
	cat("Deaths per Million:", round(national_stats$deaths_per_mill, 2), "\n\n")

	cat("=== STATE-LEVEL SUMMARY ===\n")
	cat("Number of States/Territories:", state_stats$total_states, "\n")
	cat("Average Cases per Thousand:", round(state_stats$avg_cases_per_thou, 2), "\n")
	cat("Median Cases per Thousand:", round(state_stats$median_cases_per_thou, 2), "\n")
	cat("Average Deaths per Thousand:", round(state_stats$avg_deaths_per_thou, 2), "\n")
	cat("Median Deaths per Thousand:", round(state_stats$median_deaths_per_thou, 2), "\n")


	```

	## Summary of Findings

	This analysis of COVID-19 data from the Johns Hopkins CSSE repository reveals significant insights about the pandemic's impact across the United States. The examination of both national trends and state-level variations demonstrates the heterogeneous nature of pandemic spread across different populations and geographies. The United States experienced multiple distinct waves of COVID-19 infections, with clear peaks and valleys in both case and death counts effectively captured through logarithmic scale visualizations. Significant variation exists across states in terms of cases and deaths per thousand population, likely reflecting differences in population density, demographics, healthcare infrastructure, and policy responses. A comparison of cases per capita and deaths per capita across states (based on the per-thousand tables above; no formal correlation is computed in this report) suggests that while some variation exists in case fatality rates, the fundamental relationship between infection spread and mortality remains consistent at the population level.

	## Limitations and Sources of Bias

	The primary limitation of this analysis stems from temporal bias in interpretation. Analyzing COVID-19 data in 2025 with full knowledge of how the pandemic evolved creates an inherent bias in interpreting early trends through the lens of later developments. This retrospective knowledge may lead to over-interpretation of patterns that seemed less significant at the time, or conversely, may cause us to undervalue the uncertainty and fear that characterized early pandemic decision-making. Additionally, the data itself contains reporting delays, weekend effects, and evolving case definitions that affect the reliability of day-to-day comparisons.

	## Personal Bias and Mitigation

	As an analyst examining this data with the benefit of hindsight, I carry temporal bias that influences my interpretation of early pandemic trends. Knowing the eventual trajectory of cases, deaths, and policy responses, I may unconsciously interpret initial data patterns as more predictable or logical than they actually were at the time. To mitigate this bias, I have focused on transparent methodology by clearly documenting all data processing steps and analytical choices, avoided making causal claims, and emphasized descriptive patterns rather than explanatory theories that might be influenced by hindsight knowledge.

	```{r session-info, include=FALSE}
	# Record R version and loaded packages for reproducibility
	utils::sessionInfo()
	```
No results found