class: center, middle, inverse, title-slide .title[ # Introduction to Data Science ] .subtitle[ ## Week 8: Regression analysis ] .author[ ### Ugur Aytun ] .institute[ ### METU, Department of Economics | ECON 413 ] --- --- # Regression analysis -- - I will show you regression analysis next week, with a brief introduction to the topic. -- - Regression analysis is a statistical method used to understand the relationship between variables, often used for prediction and forecasting. -- - We can also infer causal relationships, but we need to be careful about the assumptions we make. -- - The `fixest` package is used for regression analysis in R; it allows for efficient estimation of linear models with fixed effects. --- # Key concepts in regression analysis -- - Outcome variable: The variable we are trying to explain or predict. -- - Control variables: Additional variables included in the model to account for other factors that may influence the outcome variable. -- - We can also infer causal relationships, but we need to be careful about the assumptions we make. -- - Economic theory or intuition: The underlying economic principles or reasoning that guide the choice of variables in the regression model. 
--- # Random sample of data .scrollable[ ``` r # install.packages("fixest") and set the working directory #library('R.utils') library(data.table) ``` ``` ## Warning: package 'data.table' was built under R version 4.3.3 ``` ``` r library(sf) ``` ``` ## Warning: package 'sf' was built under R version 4.3.3 ``` ``` ## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE ``` ``` r library(ggplot2) ``` ``` ## Warning: package 'ggplot2' was built under R version 4.3.3 ``` ``` r library(maps) ``` ``` ## Warning: package 'maps' was built under R version 4.3.3 ``` ``` r library(scales) # for squish ``` ``` ## Warning: package 'scales' was built under R version 4.3.3 ``` ``` r library(dplyr) ``` ``` ## Warning: package 'dplyr' was built under R version 4.3.3 ``` ``` ## ## Attaching package: 'dplyr' ``` ``` ## The following objects are masked from 'package:data.table': ## ## between, first, last ``` ``` ## The following objects are masked from 'package:stats': ## ## filter, lag ``` ``` ## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union ``` ``` r setwd("H:/My Drive/ECON413") # install.packages("fixest") library(fixest) ``` ``` ## Warning: package 'fixest' was built under R version 4.3.3 ``` ``` ## ## Attaching package: 'fixest' ``` ``` ## The following object is masked from 'package:scales': ## ## pvalue ``` ``` r library(data.table) # create a random dataset set.seed(123) n <- 1000 x1 <- rnorm(n) x2 <- rnorm(n) x3 <- rnorm(n) # create a data frame df <- data.frame(y = rnorm(n), x1 = x1, x2 = x2, x3 = x3) # create a data table df <- as.data.table(df) # create a data frame with a factor variable df <- data.frame(y = rnorm(n), x1 = x1, x2 = x2, x3 = x3, group = sample(1:10, n, replace = TRUE)) # regression with fixest reg1 <- feols(y # dependent variable ~ x1 + x2 + x3 # independent variables | group, # fixed effects data = df) # data frame summary(reg1) # summary of the regression ``` ``` ## OLS estimation, Dep. 
Var.: y ## Observations: 1,000 ## Fixed-effects: group: 10 ## Standard-errors: Clustered (group) ## Estimate Std. Error t value Pr(>|t|) ## x1 0.022390 0.020954 1.068540 0.3130976 ## x2 0.093577 0.024179 3.870241 0.0037874 ** ## x3 -0.018001 0.034001 -0.529424 0.6093203 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.991184 Adj. R2: 0.005989 ## Within R2: 0.010026 ``` ``` r # interpret the results # the coefficients of x1, x2, and x3 are the average effect of these variables on y # the fixed effects are the average effect of each group (relative to the reference category) on y # tabulate the results with more information table_1 <- etable(reg1) table_1 ``` ``` ## reg1 ## Dependent Var.: y ## ## x1 0.0224 (0.0209) ## x2 0.0936** (0.0242) ## x3 -0.0180 (0.0340) ## Fixed-Effects: ----------------- ## group Yes ## _______________ _________________ ## S.E.: Clustered by: group ## Observations 1,000 ## R2 0.01793 ## Within R2 0.01003 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` ``` r # save in LaTeX format etable(reg1, file = "results/table_1.tex") # save as a tex format. # What is LaTeX? # add cluster. What is clustering in regression analysis? reg1 <- feols(y # dependent variable ~ x1 + x2 + x3 # independent variables | group, # fixed effects data = df, cluster = c("group")) # data frame summary(reg1) # summary of the regression ``` ``` ## OLS estimation, Dep. Var.: y ## Observations: 1,000 ## Fixed-effects: group: 10 ## Standard-errors: Clustered (group) ## Estimate Std. Error t value Pr(>|t|) ## x1 0.022390 0.020954 1.068540 0.3130976 ## x2 0.093577 0.024179 3.870241 0.0037874 ** ## x3 -0.018001 0.034001 -0.529424 0.6093203 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.991184 Adj. 
R2: 0.005989 ## Within R2: 0.010026 ``` ] --- # Regression analysis with a real dataset .scrollable[ ``` r library(WDI) ``` ``` ## Warning: package 'WDI' was built under R version 4.3.3 ``` ``` r series <- WDI_data$series countries <- WDI_data$country # Revisiting the relationship between manufacturing industry and GDP per capita (Kaldor Law #1) dat2 <- WDI(indicator = c("NY.GDP.MKTP.KD.ZG", # GDP growth "NV.IND.MANF.KD.ZG"), # manufacturing growth country = c("all"), start = 1960, end = 2025) # without fixed effects reg1 <- feols(NY.GDP.MKTP.KD.ZG ~ # gdp growth NV.IND.MANF.KD.ZG, # manufacturing growth cluster = c("country"), # cluster data = dat2) ``` ``` ## NOTE: 8,993 observations removed because of NA values (LHS: 3,407, RHS: 8,963). ``` ``` r summary(reg1) ``` ``` ## OLS estimation, Dep. Var.: NY.GDP.MKTP.KD.ZG ## Observations: 8,297 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 3.126700 0.177034 17.66160 < 2.2e-16 *** ## NV.IND.MANF.KD.ZG 0.109966 0.041039 2.67957 0.0079241 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 4.92829 Adj. R2: 0.10488 ``` ``` r reg2 <- feols(NY.GDP.MKTP.KD.ZG ~ NV.IND.MANF.KD.ZG | country + year, # fixed effects cluster = c("country"), # cluster data = dat2) ``` ``` ## NOTE: 8,993 observations removed because of NA values (LHS: 3,407, RHS: 8,963). ``` ``` r summary(reg2) ``` ``` ## OLS estimation, Dep. Var.: NY.GDP.MKTP.KD.ZG ## Observations: 8,297 ## Fixed-effects: country: 223, year: 63 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## NV.IND.MANF.KD.ZG 0.088246 0.0329 2.68224 0.007863 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 4.43211 Adj. 
R2: 0.250382 ## Within R2: 0.07942 ``` ``` r etable(reg1, reg2) ``` ``` ## reg1 reg2 ## Dependent Var.: NY.GDP.MKTP.KD.ZG NY.GDP.MKTP.KD.ZG ## ## Constant 3.127*** (0.1770) ## NV.IND.MANF.KD.ZG 0.1100** (0.0410) 0.0882** (0.0329) ## Fixed-Effects: ----------------- ----------------- ## country No Yes ## year No Yes ## _________________ _________________ _________________ ## S.E.: Clustered by: country by: country ## Observations 8,297 8,297 ## R2 0.10499 0.27613 ## Within R2 -- 0.07942 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ``` ] --- # Marginal propensity to consume (MPC) and the consumption function .scrollable[ ``` r # marginal propensity to consume (MPC) # MPC is the change in consumption divided by the change in income # MPC = change in consumption / change in income # let's run a regression consumption_datasets <- WDIsearch(string = "consumption", field = "name", short = TRUE, cache = NULL) gdp_datasets <- WDIsearch(string = "gdp", field = "name", short = TRUE, cache = NULL) dat3 <- WDI(indicator = c("NE.CON.TOTL.KD", # Final consumption expenditure (constant 2015 US$) "NY.GDP.MKTP.KD"), # GDP country = c("all"), start = 1960, end = 2025) reg4 <- feols(log(NE.CON.TOTL.KD) ~ log(NY.GDP.MKTP.KD) | country + year, # fixed effects cluster = c("country"), # cluster data = dat3) ``` ``` ## NOTE: 8,637 observations removed because of NA values (LHS: 8,631, RHS: 3,287). ``` ``` r summary(reg4) ``` ``` ## OLS estimation, Dep. Var.: log(NE.CON.TOTL.KD) ## Observations: 8,653 ## Fixed-effects: country: 222, year: 64 ## Standard-errors: Clustered (country) ## Estimate Std. Error t value Pr(>|t|) ## log(NY.GDP.MKTP.KD) 0.826403 0.045027 18.3534 < 2.2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## RMSE: 0.168756 Adj. R2: 0.99595 ## Within R2: 0.515541 ``` ]