class: center, middle, inverse, title-slide .title[ # Introduction to Data Science ] .subtitle[ ## Week 8: Regression analysis ] .author[ ### Ugur Aytun ] .institute[ ### METU, Department of Economics | ECON 413 ] --- --- # Regression analysis -- - I will show you regression analysis next week, with a brief introduction to the topic. -- - Regression analysis is a statistical method used to understand the relationship between variables; it is often used for prediction and forecasting. -- - We can also infer causal relationships, but we need to be careful about the assumptions we make. -- - The `fixest` package is used for regression analysis in R; it allows for efficient estimation of linear models with fixed effects. --- # Key concepts in regression analysis -- - Outcome variable: The variable we are trying to explain or predict. -- - Control variables: Additional variables included in the model to account for other factors that may influence the outcome variable. -- - Causal interpretation: We can also infer causal relationships, but we need to be careful about the assumptions we make. -- - Economic theory or intuition: The underlying economic principles or reasoning that guide the choice of variables in the regression model. 
--- # Random sample of data .scrollable[ ``` r # install.packages("fixest") and set the working directory #library('R.utils') library(data.table) ``` ``` ## Warning: package 'data.table' was built under R version 4.3.3 ``` ``` r library(sf) ``` ``` ## Warning: package 'sf' was built under R version 4.3.3 ``` ``` ## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE ``` ``` r library(ggplot2) ``` ``` ## Warning: package 'ggplot2' was built under R version 4.3.3 ``` ``` r library(maps) ``` ``` ## Warning: package 'maps' was built under R version 4.3.3 ``` ``` r library(scales) # for squish ``` ``` ## Warning: package 'scales' was built under R version 4.3.3 ``` ``` r library(dplyr) ``` ``` ## Warning: package 'dplyr' was built under R version 4.3.3 ``` ``` ## ## Attaching package: 'dplyr' ``` ``` ## The following objects are masked from 'package:data.table': ## ## between, first, last ``` ``` ## The following objects are masked from 'package:stats': ## ## filter, lag ``` ``` ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union ``` ``` r setwd("H:/My Drive/ECON413") # install.packages("fixest") library(fixest) ``` ``` ## Warning: package 'fixest' was built under R version 4.3.3 ``` ``` ## ## Attaching package: 'fixest' ``` ``` ## The following object is masked from 'package:scales': ## ## pvalue ``` ``` r library(data.table) # create a random dataset set.seed(123) n <- 1000 x1 <- rnorm(n) x2 <- rnorm(n) x3 <- rnorm(n) # create a data frame df <- data.frame(y = rnorm(n), x1 = x1, x2 = x2, x3 = x3) # create a data table df <- as.data.table(df) # create a data frame with a factor variable df <- data.frame(y = rnorm(n), x1 = x1, x2 = x2, x3 = x3, group = sample(1:10, n, replace = TRUE)) # regression with fixest reg1 <- feols(y # dependent variable ~ x1 + x2 + x3 # independent variables | group, # fixed effects data = df) # data frame summary(reg1) # summary of the regression ``` ``` ## OLS estimation, Dep. 
Var.: y ## Observations: 1,000 ## Fixed-effects: group: 10 ## Standard-errors: Clustered (group) ## Estimate Std. Error t value Pr(>|t|) ## x1 0.022390 0.020954 1.068540 0.3130976 ## x2 0.093577 0.024179 3.870241 0.0037874 ** ## x3 -0.018001 0.034001 -0.529424 0.6093203 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.991184 Adj. R2: 0.005989 ## Within R2: 0.010026 ``` ``` r # interpret the results # the coefficients of x1, x2, and x3 are the average effect of these variables on y # the fixed effects are the average effect of each group (relative to the reference category) on y # tabulate the results with more information table_1 <- etable(reg1) table_1 ``` ``` ## reg1 ## Dependent Var.: y ## ## x1 0.0224 (0.0209) ## x2 0.0936** (0.0242) ## x3 -0.0180 (0.0340) ## Fixed-Effects: ----------------- ## group Yes ## _______________ _________________ ## S.E.: Clustered by: group ## Observations 1,000 ## R2 0.01793 ## Within R2 0.01003 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` ``` r # save in LaTeX format etable(reg1, file = "results/table_1.tex") # save as a tex format. # What is LaTeX? # add clustering. What is a cluster in regression analysis? reg1 <- feols(y # dependent variable ~ x1 + x2 + x3 # independent variables | group, # fixed effects data = df, cluster = c("group")) # data frame summary(reg1) # summary of the regression ``` ``` ## OLS estimation, Dep. Var.: y ## Observations: 1,000 ## Fixed-effects: group: 10 ## Standard-errors: Clustered (group) ## Estimate Std. Error t value Pr(>|t|) ## x1 0.022390 0.020954 1.068540 0.3130976 ## x2 0.093577 0.024179 3.870241 0.0037874 ** ## x3 -0.018001 0.034001 -0.529424 0.6093203 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.991184 Adj. 
R2: 0.005989 ## Within R2: 0.010026 ``` ] --- # Regression analysis with a real dataset .scrollable[ ``` r library(WDI) ``` ``` ## Warning: package 'WDI' was built under R version 4.3.3 ``` ``` r series <- WDI_data$series countries <- WDI_data$country # Revisiting the relationship between manufacturing industry and GDP per capita (Kaldor Law #1) dat2 <- WDI(indicator = c("NY.GDP.MKTP.KD.ZG", # GDP growth "NV.IND.MANF.KD.ZG"), # manufacturing growth country = c("all"), start = 1960, end = 2025) # without fixed effects reg1 <- feols(NY.GDP.MKTP.KD.ZG ~ # gdp growth NV.IND.MANF.KD.ZG, # manufacturing growth cluster = c("country"), # cluster data = dat2) ``` ``` ## NOTE: 8,712 observations removed because of NA values (LHS: 3,176, RHS: 8,701). ``` ``` r summary(reg1) ``` ``` ## OLS estimation, Dep. Var.: NY.GDP.MKTP.KD.ZG ## Observations: 8,578 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.120514 0.170906 18.25860 < 2.2e-16 *** ## NV.IND.MANF.KD.ZG 0.109454 0.039793 2.75057 0.0064355 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 4.99331 Adj. R2: 0.101456 ``` ``` r reg2 <- feols(NY.GDP.MKTP.KD.ZG ~ NV.IND.MANF.KD.ZG | country + year, # fixed effects cluster = c("country"), # cluster data = dat2) ``` ``` ## NOTE: 8,712 observations removed because of NA values (LHS: 3,176, RHS: 8,701). ``` ``` r summary(reg2) ``` ``` ## OLS estimation, Dep. Var.: NY.GDP.MKTP.KD.ZG ## Observations: 8,578 ## Fixed-effects: country: 225, year: 64 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## NV.IND.MANF.KD.ZG 0.08919 0.032405 2.7524 0.0064008 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 4.4943 Adj. 
R2: 0.246872 ## Within R2: 0.079021 ``` ``` r etable(reg1, reg2) ``` ``` ## reg1 reg2 ## Dependent Var.: NY.GDP.MKTP.KD.ZG NY.GDP.MKTP.KD.ZG ## ## Constant 3.121*** (0.1709) ## NV.IND.MANF.KD.ZG 0.1095** (0.0398) 0.0892** (0.0324) ## Fixed-Effects: ----------------- ----------------- ## country No Yes ## year No Yes ## _________________ _________________ _________________ ## S.E.: Clustered by: country by: country ## Observations 8,578 8,578 ## R2 0.10156 0.27216 ## Within R2 -- 0.07902 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` ] --- # Marginal propensity to consume (MPC) and the consumption function .scrollable[ ``` r # marginal propensity to consume (MPC) # MPC is the change in consumption divided by the change in income # MPC = change in consumption / change in income # let's run a regression # note: both variables are in logs, so the coefficient is the income elasticity of consumption, not the MPC itself consumption_datasets <- WDIsearch(string = "consumption", field = "name", short = TRUE, cache = NULL) gdp_datasets <- WDIsearch(string = "gdp", field = "name", short = TRUE, cache = NULL) dat3 <- WDI(indicator = c("NE.CON.TOTL.KD", # Final consumption expenditure (constant 2015 US$) "NY.GDP.MKTP.KD"), # GDP country = c("all"), start = 1960, end = 2025) reg4 <- feols(log(NE.CON.TOTL.KD) ~ log(NY.GDP.MKTP.KD) | country + year, # fixed effects cluster = c("country"), # cluster data = dat3) ``` ``` ## NOTE: 8,477 observations removed because of NA values (LHS: 8,471, RHS: 3,056). ``` ``` r summary(reg4) ``` ``` ## OLS estimation, Dep. Var.: log(NE.CON.TOTL.KD) ## Observations: 8,813 ## Fixed-effects: country: 222, year: 65 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## log(NY.GDP.MKTP.KD) 0.826025 0.046713 17.6829 < 2.2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.183763 Adj. R2: 0.995192 ## Within R2: 0.479675 ``` ]