```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(margins)
library(psych)
library(QuantPsyc)
library(haven)
library(marginaleffects)
```

## BIVARIATE REGRESSION

load data first (we directly load .dta data here using haven package)

remember to change the path of the file to the path in your computer.

```{r}
sa_data <- read_dta("Data Files/ps2030.bank-salaries.dta")
```

### Generate Scatterplot of Salary and Education Level

```{r, fig.show="hold", out.width="49%"}

p=ggplot(sa_data, aes(x = edlevel, y = salnow)) +
  geom_jitter(width = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Scatter plot of Salary and Education Level",
       x = "Education Level",
       y = "Salary")
p+scale_x_continuous(breaks = seq(5,20,5))+scale_y_continuous(breaks = seq(0,60000,20000))
```

### Bivariate Regression

```{r}
lm_bivariate <- lm(salnow ~ edlevel, data = sa_data)
summary(lm_bivariate)
```

### Generate Predicted Values of Y (YHAT) and Residuals (e)

```{r}
sa_data$predsal <- predict(lm_bivariate)
sa_data$residsal <- residuals(lm_bivariate)
# manual version:
sa_data$manpred<--7332.471+1563.963*sa_data$edlevel
sa_data$manresid=sa_data$salnow-sa_data$manpred
# list some cases
head(sa_data[, c("salnow", "predsal", "manpred", "residsal", "manresid")], 20)
```

### Generate predicted values for specific X values, using margins package or you can also use marginal effect package

```{r}
margins::prediction(lm_bivariate, at =
list(edlevel = 12))

marg_8_12_20_21<-margins::prediction(lm_bivariate, at = list(edlevel = c(8, 12, 20, 21)))

marg_8_12_20_21

ggplot() +
  geom_point(data = marg_8_12_20_21, aes(x = edlevel, y = fitted)) +
  geom_line(data = marg_8_12_20_21, aes(x = edlevel, y = fitted)) +
  geom_ribbon(data = marg_8_12_20_21, aes(x = edlevel, ymin = fitted - 2*se.fitted, ymax = fitted + 2*se.fitted), alpha = 0.2, fill = "blue") +
  labs(title = "Predicted Values and Confidence Intervals for Different Education Levels",
       x = "Education level",
       y = "Linear prediction") +
  scale_x_continuous(breaks = c(8,12,20,21)) +
  scale_y_continuous(breaks = seq(5000,25000,5000))


```

Alternatively, we can also use marginaleffects package to get the same results.

```{r}
avg_predictions(lm_bivariate, variables = list(edlevel = 12))

avg_predictions(lm_bivariate, variables = list(edlevel = c(8, 12, 20, 21)))
```

### Descriptive statistics on Y, Yhat, and Residuals

```{r}
summary(sa_data[c("salnow", "predsal", "residsal")])
```

### Do the errors of prediction correlate with any other variable?

```{r}
tapply(sa_data$residsal, sa_data$sex, mean, na.rm = TRUE)
```

## MULTIPLE REGRESSION

```{r}
saf_data <- read_dta("Data Files/ps2030.safrica.dta")
# Summary statistics of 'know', 'times', and 'educ1'
summary_stats <- describe(saf_data[c("know", "times", "educ1")], trim=0)
print(summary_stats)

# Correlation matrix between 'know', 'times', and 'educ1'
cor_matrix <- cor(saf_data[c("know", "times", "educ1")])
print(cor_matrix)
```

### BIVARIATE

```{r}
lm_bivariate_know_times <- lm(know ~ times, data = saf_data)
summary(lm_bivariate_know_times)

lm_bivariate_know_educ1 <- lm(know ~ educ1, data = saf_data)
summary(lm_bivariate_know_educ1)
```

### Manual Multivariate Regression

```{r}
# coefficient for educ1
# Regression of know on times
model_know_times <- lm(know ~ times, data = saf_data)
summary(model_know_times)
yresidtimes <- residuals(model_know_times)

# Regression of educ1 on times
model_educ1_times <- lm(educ1 ~ times, data = saf_data)
summary(model_educ1_times)
xresidtimes <- residuals(model_educ1_times)
# Regression of yresidtimes on xresidtimes
model_yresid_xresid <- lm(yresidtimes ~ xresidtimes)
summary(model_yresid_xresid)
```

### BETA COEFFICIENTS AND CALCULATIONS

```{r}
lm_multivariate <- lm(know ~ times + educ1, data = saf_data)
summary(lm_multivariate)

# Extracting standardized coefficients (beta)
coef_lmbeta <- lm.beta(lm_multivariate)
coef_lmbeta
```

### F\* Tests and Model Building: HYPOTHETICAL 'SOCIAL BACKGROUND' VERSUS 'MOTIVATIONAL' MODEL

```{r}
# THE 'FULL' MODEL
lm_full_model <- lm(know ~ age1 + educ1 + church + interest + media2 + efficacy, data = saf_data)
summary(lm_full_model)
# REDUCED MODEL 1:  MOTIVATIONAL VARIABLES ONLY
lm_reduced_model1 <- lm(know ~ interest + media2 + efficacy, data = saf_data)
summary(lm_reduced_model1)
# REDUCED MODEL 2:  SOCIAL BACKGROUND VARIABLES ONLY
lm_reduced_model2 <- lm(know ~ age1 + educ1 + church, data = saf_data)
summary(lm_reduced_model2)
# F* Test for Social Background Model
anova(lm_reduced_model1, lm_full_model)

# F* Test for Motivational Model
anova(lm_reduced_model2, lm_full_model)

```

### BASIC RESIDUAL PLOTS

```{r}
# generates a new variable "residknow" that represents the residual from the multiple regression function
saf_data$residuals_know <- residuals(lm_multivariate)

# generates a new variable "predknow" that represents the predicted value of Y from the multiple regression function
saf_data$predicted_know <- predict(lm_multivariate)

# plot the Residuals against Predicted Values of Y
ggplot(saf_data, aes(x = predicted_know, y = residuals_know)) +
   geom_jitter(width = 0.5)  +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatter Plot of Residuals vs Predicted Values with Fitted Line",
       x = "Predicted Values",
       y = "Residuals") + 
  scale_x_continuous(breaks = seq(2,8,2)) +
  scale_y_continuous(breaks = seq(-5,5,5)) +
  theme_minimal()

# Residuals against one of the predictor X variables
ggplot(saf_data, aes(x = times, y = residuals_know)) +
   geom_jitter(width = 0.3)  +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatter Plot of Residuals vs Times with Fitted Line",
       x = "Times",
       y = "Residuals") + 
  scale_x_continuous(breaks = seq(0,4,1)) +
  scale_y_continuous(breaks = seq(-5,5,5)) +
  theme_minimal()

```

### Histogram and Normal Probability Plot of Residuals

```{r}

ggplot(saf_data, aes(x = residuals_know)) +
  geom_histogram(aes(y = ..density..), fill = "blue", color = "black", alpha = 0.7, breaks = seq(-6.1596866, 6, by = .41887331)) +
  labs(title = "Density Histogram of Residuals",
       x = "Residuals",
       y = "Density") +
    scale_x_continuous(breaks = seq(-5,5,5)) +
  scale_y_continuous(breaks = seq(0,0.3,0.1)) +
  theme_minimal()
qqnorm(saf_data$residuals_know)
qqline(saf_data$residuals_know)
```

### OUTLIERS AND LEVERAGE TESTS: LET'S USE THE COUNTRIES DATA HERE

```{r}
countries_data <- read_dta("Data Files/ps2030.countries_2002.dta")
ggplot(countries_data, aes(x = urban, y = life)) +
  geom_jitter(width = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Scatter plot of life and urban",
       x = "Percent urban",
       y = "life")+
  scale_x_continuous(breaks = seq(0,100,20))+
  scale_y_continuous(breaks = seq(40,80,10))

lm_life <- lm(life ~ urban, data = countries_data)
summary(lm_life)
countries_data$liferesid <- residuals(lm_life)
countries_data$lifepred <- predict(lm_life)

# look at normality
hist(countries_data$liferesid, breaks = seq(-29.083878, 20, by = 2.2321515), prob=TRUE, xlab = "Residuals" )
qqnorm(countries_data$liferesid)
qqline(countries_data$liferesid)


```

```{r}
# Standardized Residuals
countries_data$standresid <- countries_data$liferesid / sd(countries_data$liferesid)
ggplot(countries_data, aes(x = urban, y = standresid)) +
  geom_jitter(width = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Scatter plot of life and urban",
       x = "Percent urban",
       y = "standresid")+
  scale_x_continuous(breaks = seq(0,100,20))+
  scale_y_continuous(breaks = seq(-4,2,2))

# GIVES ALL THE LARGEST RESIDUALS FOR EXAMINATION
subset_data <-countries_data[abs(countries_data$standresid) > 2, c("country", "life", "lifepred", "liferesid", "standresid")]
print(subset_data)
```

## Note for Homework

To run a regression in R for only those cases that are not missing on any of the specified variables (var1 to var4), you can use the complete.cases() function to filter your dataset.

```{r, eval=F, echo=T}
# Assuming you have a dataframe named 'your_data' with variables 'depvar', 'var1', 'var2', 'var3', and 'var4'

# Create a subset of data containing only complete cases on var1 to var4
subset_data <- your_data[complete.cases(your_data$var1, your_data$var2, your_data$var3, your_data$var4), ]

# Run a regression of depvar on var1 and var2 using the subsetted data
lm_model <- lm(depvar ~ var1 + var2, data = subset_data)

# Print the regression summary
summary(lm_model)
```