########################## PREMISES ##########################
###Step Minus One###
#Remember to comment as much as possible.
#You should do that more for yourself than for the assignment.
#To remember what has been done in a script is not easy and, maybe, this script would be useful for you even a couple of years from now
###Step Zero###
#Understanding the windows: script, console, environment, help,...
2+1
#try to put the same command inside the console.
#What is the difference?
#What you do directly in the console is not saved!
#(Remember to save often your script. A few things in life are more )
###First Step###
#Some basics
#Creating a vector
# character-entry
x <- c("red", "blue", "green", "no color")
rm(x) #to remove the object "x"
# numeric-entry
a=c(1,2,3) #same for a<-c(1,2,3)
a
a=as.character(a) #We are saying to R that the entries have to be considered characters
a=as.numeric(a) #We are saying to R that the entries have to be considered numbers
b=c(4,5,6)
b
c=c(7,8,9)
c
#Creating a matrix
A1=c(a,b,c)
A1
A2=matrix(c(a,b,c),nrow=3,ncol=3)
A2
A3=rbind(a,b,c)
A3
A4=cbind(a,b,c)
A4
###Second Step###
#Setting a directory
help(setwd)
# to remove all objects
rm(list = ls())
setwd("C:\Users\Massimo Ceraolo\Desktop\Principles_of_economics") #this syntax does not work because2 "\" is considered a command
######
setwd("C:\\Users\\Massimo Ceraolo\\Desktop\\Principles_of_economics") #we can solve this either with a double backward slash
######
setwd("C:/Users/Massimo Ceraolo/Desktop/Principles_of_economics") #or this a forward slash
getwd()
###Third Step###
#Importing a dataset
install.packages(c("tidyverse","readr")) #you need to installa package just once (to install a package you need an Internet connection)
library(tidyverse) #the command "library" ask to R Studio to add to the CURRENT session the commands added by that package.
library(readr) #You need to repeat this command every time you re-open R Studio
tempdata <- read.csv("NH.Ts+dSST.csv", skip = 1, na.strings = "***") #"tempdata" is the name we have chosen for the imported the dataset.
#You can choose whatever name you want. I suggest you to choose something short but that helps you to remember the type of data that are inside.
#Try to import the dataset with the user interface (Inside the Environment window there is "Import Dataset")
#Is R recognising correctly the data? Check it immediatly opening the dataset or using the following commands
head(tempdata)
str(tempdata)
########################## PART 1.1 ##########################
###Drawning a graph line
tempdata$Jan <- ts(tempdata$Jan, start = c(1880), end = c(2017), frequency = 1)
#It does not work. What should we do?
#Let us give a look at the data!
#Our time series goes beyond the 2017
#Let us drop a line
#general command: mydataframe[-c(row_index_1, row_index_2),]
newdata=tempdata[-c(143),] #because we want to drop just the line n.143
tempdata=tempdata[-c(143),] #There is no need to use different names for changes in the dataset (sometimes it helps, sometomes is adds confusion)
rm(newdata)
tempdata$Jan <- ts(tempdata$Jan, start = c(1880), end = c(2021), frequency = 1)
tempdata$DJF <- ts(tempdata$DJF,
start = c(1880), end = c(2021), frequency = 1)
tempdata$MAM <- ts(tempdata$MAM,
start = c(1880), end = c(2021), frequency = 1)
tempdata$JJA <- ts(tempdata$JJA,
start = c(1880), end = c(2021), frequency = 1)
tempdata$SON <- ts(tempdata$SON,
start = c(1880), end = c(2021), frequency = 1)
tempdata$J.D <- ts(tempdata$J.D,
start = c(1880), end = c(2021), frequency = 1)
# Set line width and colour
plot(tempdata$Jan, type = "l", col = "blue", lwd = 2, ylab = "Annual temperature anomalies", xlab = "Year")
#let's put the year on the x axis
plot(x=tempdata$Year,y=tempdata$Jan,, type = "l", col = "blue", lwd = 2, ylab = "Annual temperature anomalies", xlab = "Year")
#Try some variations! https://www.rdocumentation.org/packages/graphics/versions/3.6.2/topics/plot
plot(tempdata$Year,tempdata$Jan, type = "l", col = "green", lwd = 5, ylab = "Annual temperature anomalies", xlab = "Year")
plot(tempdata$Year,tempdata$Jan, type = "p", col = "green", lwd = 5, ylab = "Annual temperature anomalies", xlab = "Year")
#In R, colors can be specified either by name (e.g col = "red") or as a hexadecimal RGB triplet (such as col = "#FFCC00").
#You can also use other color systems such as ones taken from the RColorBrewer package.
#https://html-color-codes.com/
plot(tempdata$Year,tempdata$Jan, type = "p", col = "#009999", lwd = 5, ylab = "Annual temperature anomalies", xlab = "Year")
# Add a title
title("Average temperature anomaly in January in the northern hemisphere (1880-2016)")
# Add a horizontal line (at y = 0)
abline(h = 0, col = "darkorange2", lwd = 2)
# Add a label to the horizontal line
text(2000, -0.1, "1951-1980 average")
########################## PART 1.2 ##########################
###Creating an histogram
tempdata$Period <- factor(NA, levels = c("1921-1950", "1951-1980", "1981-2010"), ordered = TRUE)
#Look at the dataset!
#What is a factor variable in R Language? https://www.stat.berkeley.edu/~s133/factors.html
tempdata$Period[(tempdata$Year > 1920) & (tempdata$Year < 1951)] <- "1921-1950"
tempdata$Period[(tempdata$Year > 1950) & (tempdata$Year < 1981)] <- "1951-1980"
tempdata$Period[(tempdata$Year > 1980) & (tempdata$Year < 2011)] <- "1981-2010"
#Look at the dataset!
tempdata$prova <- tempdata$Jan + tempdata$Feb
tempdata = subset(tempdata, select = -c(prova))
temp_summer <- c(tempdata$Jun, tempdata$Jul, tempdata$Aug)
temp_summer <- unlist(tempdata[,7:9],use.names = FALSE)
temp_Period <- c(tempdata$Period, tempdata$Period, tempdata$Period)
temp_Period <- factor(temp_Period, levels = 1:nlevels(tempdata$Period),labels = levels(tempdata$Period))
hist(temp_summer[(temp_Period == "1951-1980")], plot = FALSE)
#install.packages("mosaic")
library(mosaic)
histogram(~ temp_summer | temp_Period, type = "count",
breaks = seq(-0.5, 1.3, 0.10),
main = "Histogram of Temperature anomalies",
xlab = "Summer temperature distribution")
###Using the "quantile" function
# Select years 1951 to 1980
temp_all_months <- subset(tempdata, (Year >= 1951 & Year <= 1980))
# Columns 2 to 13 contain months Jan to Dec.
temp_51to80 <- unlist(temp_all_months[, 2:13])
temp_51to80
# c(0.3, 0.7) indicates the chosen percentiles.
perc <- quantile(temp_51to80, c(0.3, 0.7))
perc
# The cold threshold
p30 <- perc[1]
p30
# The hot threshold
p70 <- perc[2]
p70
###Using the "mean" function
mean(c(2,3,4))
temp <- temp_51to80 <p30
temp
mean(temp) #it interprets TRUE=1 and FALSE=0
###Calculating and understanding mean and variance
paste("Mean of DJF temperature anomalies across periods")
mean(~DJF|Period,data = tempdata)
#Why this command has this peculiar structure?
#It is due to the mosaic package
#First of all: DO NOT PANIC. In R there are many many different ways to do the same thing
#Second of all: each package has its explanation paper: https://journal.r-project.org/archive/2017/RJ-2017-024/RJ-2017-024.pdf
#here we can read: "Formulas with a single variable correspond to situations where there is no "explanatory" variable
#to be included, for example in simple numerical summaries or depictions of the distribution of a
#variable" (page 81, keyword "mean")
paste("Variance of DJF anomalies across periods")
var(~DJF|Period,data = tempdata)
###Putting 2 time series in the same plot
plot(tempdata$Year,tempdata$DJF, type = "l", col = "blue", lwd = 2,
ylab = "Annual temperature anomalies", xlab = "Year")
# \n creates a line break
title("Average temperature anomaly in DJF and JJA \n in the northern hemisphere (1880-2016)")
# Add a horizontal line (at y = 0)
abline(h = 0, col = "green", lwd = 2)
lines(tempdata$Year,tempdata$JJA, col = "orange", lwd = 2)
# Add a label to the horizontal line
text(1895, 0.1, "1951-1980 average",)
legend(1880, 1.5, legend = c("DJF", "JJA"), col = c("blue", "orange"), lty = 1, cex = 0.8, lwd = 2)
########################## PART 1.3 ##########################
### Scatterplots and the correlation coefficient
#install.packages("readxl")
library(readxl)
CO2data <- read_excel("1_CO2-data.xlsx")
CO2data_june <- CO2data[CO2data$Month == 6,]
?merge
tempCO2data <- merge(tempdata, CO2data_june)
head(tempCO2data)
head(tempCO2data[, c("Year", "Jun", "Trend")])
plot(tempCO2data$Jun, tempCO2data$Trend,
xlab = "Temperature anomaly (degrees Celsius)",
ylab = "CO2 levels (trend, mole fraction)",
pch = 16, col = "blue")
title("Scatterplot for CO2 emissions and temperature anomalies")
cor(tempCO2data$Jun, tempCO2data$Trend)
str(tempCO2data)
plot(tempCO2data$Jun, type = "l", col = "blue", lwd = 2,
ylab = "June temperature anomalies", xlab = "Year")
tempCO2data$Jun <- ts(tempCO2data$Jun, start = c(1958), end = c(2017), frequency = 1)
tempCO2data$Trend <- ts(tempCO2data$Trend, start = c(1958), end = c(2017), frequency = 1)
str(tempCO2data)
plot(tempCO2data$Jun, type = "l", col = "blue", lwd = 2,
ylab = "June temperature anomalies", xlab = "Year")
title("June temperature anomalies and CO2 emissions")
### A graph with double Y axis
# Create extra margins used for the second axis
par(mar = c(5, 5, 2, 5))
plot(tempCO2data$Jun, type = "l", col = "blue", lwd = 2,
ylab = "June temperature anomalies", xlab = "Year")
title("June temperature anomalies and CO2 emissions")
# This puts the next plot into the same picture.
par(new = T)
# No axis, no labels
plot(tempCO2data$Trend, pch = 16, lwd = 2,
axes = FALSE, xlab = NA, ylab = NA, cex = 1.2)
axis(side = 4)
mtext(side = 4, line = 3, 'CO2 emissions')
legend("topleft", legend = c("June temp anom", "CO2 emis"),
lty = c(1, 1), col = c("blue", "black"), lwd = 2)
*From progressive taxation to DIT - Complete DID2S Analysis for Academic Paper
clear all
set more off
*ssc install did2s, replace
*ssc install coefplot, replace
*ssc install estout, replace
set scheme plotplain, permanently
import excel "C:\Users\Michele\Desktop\DIT paper\Dataset_DIT_relaxed.xlsx", sheet("Sheet1") firstrow clear
sum
********************************** DATA PREPARATION *****************************
*** Premises for DID2S
** Generating Gvar or Cohort (same as CS)
tempvar aux
bysort country_id:egen `aux'=min(year) if tax_reform_dummy==1
bysort country_id:egen gvar=max(`aux')
replace gvar=0 if gvar==.
** Creating event study variables
* This creates the event dummies
gen event=year-gvar
* but shifted because stata doesnt like "negative" factor variables
replace event = event+28
* And we set the never treated as 0
replace event=0 if gvar==0
** Also label them to easy recognize the dynamic effects
forvalues i = 1/56 {
label define event `i' "T`=`i'-28'", modify
}
label define event 0 "base", modify
label values event event
** restricting event2 to window (-5,5) plus never treated
clonevar event2=event if event==0 | inrange(event-28,-5,5)
// START COMPREHENSIVE LOGGING
log using "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\DID2S_academic_analysis_w5.txt", text replace
display "================================================================="
display "COMPLETE DID2S ANALYSIS FOR ACADEMIC PUBLICATION"
display "================================================================="
display "Study: The Effect of Dual Post_tax_income Taxation on Post_tax_income Inequality"
display "Method: Two-Stage Difference-in-Differences (Gardner 2022)"
display "Sample: 15 OECD countries, 1980-2019"
display "Treatment: Introduction of Dual Post_tax_income Tax systems"
display "================================================================="
*************************** CONTROL GROUPS DEFINITION ***********************
display "================================================================="
display "THEORETICAL FRAMEWORK FOR CONTROL VARIABLES"
display "================================================================="
// Based on your H1-H5 theoretical framework
global controls_cycle "real_gdp_growth_pct_IMF unemployment_rate_IMF cpi_inflation_pct_IMF"
display "H1 - Business Cycle: $controls_cycle"
global controls_development "gdp_pc_real_WDI human_capital_idx_PWT population_mil_PWT"
display "H2 - Development Level: $controls_development"
global controls_production "labour_share_PWT tfp_index_PWT capital_services_idx_PWT avg_hours_worked_PWT"
display "H3 - Production Structure: $controls_production"
global controls_capital "real_internal_return_pct_PWT depreciation_rate_pct_PWT investment_pct_gdp_WDI"
display "H4 - Capital Markets: $controls_capital"
global controls_external "current_account_pct_gdp_IMF trade_open_WDI"
display "H5 - External Balance: $controls_external"
// Define main specification (for Table 1)
global main_controls "$controls_cycle $controls_development $controls_production"
display ""
display "MAIN SPECIFICATION (Table 1): $main_controls"
display "Theoretical justification: Captures business cycle effects, development level,"
display "and production structure - key channels affecting post_tax_income distribution"
*************************** OUTCOME VARIABLES ******************************
display "================================================================="
display "OUTCOME VARIABLES DEFINITION"
display "================================================================="
// Primary outcomes for main tables
global primary_outcomes "post_tax_top10 post_tax_top5 post_tax_top1"
global primary_names "Top 10%" "Top 5%" "Top 1%"
// Secondary outcomes for robustness/appendix
global secondary_outcomes "post_tax_next9 post_tax_next4 post_tax_bottom50 post_tax_bottom90"
// Ratio outcomes
global ratio_outcomes "post_tax_ratio_t10_b50 post_tax_ratio_t10_b90 post_tax_ratio_t1_b50 post_tax_ratio_t1_b90 post_tax_ratio_n9_b50 post_tax_ratio_n9_b90 post_tax_ratio_n4_b50 post_tax_ratio_n4_b90"
// All outcomes combined
global all_outcomes "$primary_outcomes $secondary_outcomes $ratio_outcomes"
display "Primary outcomes (main tables): $primary_outcomes"
display "Secondary outcomes (appendix): $secondary_outcomes"
display "Ratio outcomes (inequality measures): $ratio_outcomes"
*************************** MAIN DID2S ANALYSIS - ALL OUTCOMES **************
******************** RUN MAIN SPECIFICATION FOR ALL OUTCOMES **************
display "================================================================="
display "MAIN DID2S ANALYSIS - ALL OUTCOMES"
display "================================================================="
display "Specification: Business cycle + Development + Production structure"
display "Running main specification for ALL outcomes first"
display "================================================================="
// Run main specification for ALL outcomes (primary, secondary, and ratios)
foreach outcome in $all_outcomes {
display ""
display "=== MAIN ANALYSIS: `outcome' ==="
display "Controls: $main_controls"
did2s `outcome', first_stage(i.country_id i.year log_gdp_pc_real_WDI Total_tax_revenue_perc_GDP_OECD labour_share_PWT population_mil_PWT cpi_inflation_pct_IMF real_internal_return_pct_PWT unemployment_rate_IMF) ///
second_stage(i.event2) treatment(tax_reform_dummy) cluster(country_id)
estimates store main_`outcome'
display "Successfully stored: main_`outcome'"
}
*************************** KEY TREATMENT EFFECTS ***************************
******************** CALCULATE EFFECTS FOR PRIMARY OUTCOMES **************
display "================================================================="
display "KEY TREATMENT EFFECTS - PRIMARY OUTCOMES"
display "================================================================="
foreach outcome in $primary_outcomes {
display ""
display "=== KEY EFFECTS: `outcome' ==="
estimates restore main_`outcome'
// Calculate key treatment effects for reporting
display "Key treatment effects:"
// Year 0 effect (immediate)
display "Treatment year (T0): " %8.4f _b[28.event2] " (se=" %6.4f _se[28.event2] ")"
// Average short-term effect (T0 to T+2)
lincom (28.event2 + 29.event2 + 30.event2)/3
display "Short-term average (T0-T2): " %8.4f r(estimate) " (se=" %6.4f r(se) " p=" %5.3f r(p) ")"
// Average medium-term effect (T+3 to T+5)
lincom (31.event2 + 32.event2 + 33.event2)/3
display "Medium-term average (T3-T5): " %8.4f r(estimate) " (se=" %6.4f r(se) " p=" %5.3f r(p) ")"
// Overall post-treatment average
lincom (28.event2 + 29.event2 + 30.event2 + 31.event2 + 32.event2 + 33.event2)/6
display "Overall post-treatment: " %8.4f r(estimate) " (se=" %6.4f r(se) " p=" %5.3f r(p) ")"
}
*************************** TABLE 1: MAIN POST_TAX_RESULTS\ ***************************
display "================================================================="
display "CREATING TABLE 1: MAIN POST_TAX_RESULTS\"
display "================================================================="
// TABLE 1: Main Post_Tax_Results\ for Academic Paper
esttab main_post_tax_top10 main_post_tax_top5 main_post_tax_top1 ///
using "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Table1_main_Post_Tax_Results_w5.tex", ///
replace booktabs ///
b(4) se(4) star(* 0.10 ** 0.05 *** 0.01) ///
title("Effect of Dual Post_tax_income Taxation on Top Post_tax_income Shares\label{tab:main_Post_Tax_Results\}") ///
mtitles("Top 10\%" "Top 5\%" "Top 1\%") ///
keep(*event2) ///
order(23.event2 24.event2 25.event2 26.event2 27.event2 28.event2 29.event2 30.event2 31.event2 32.event2 33.event2) ///
coeflabels(23.event2 "T-5" 24.event2 "T-4" 25.event2 "T-3" 26.event2 "T-2" 27.event2 "T-1" ///
28.event2 "T0" 29.event2 "T+1" 30.event2 "T+2" 31.event2 "T+3" 32.event2 "T+4" 33.event2 "T+5") ///
stats(N r2, fmt(0 3) labels("Observations" "R-squared")) ///
addnotes("Notes: This table reports coefficient estimates from two-stage difference-in-differences regressions." ///
"The dependent variables are post_tax_income shares held by the top 10\%, 5\%, and 1\% of earners." ///
"Treatment is the introduction of dual post_tax_income taxation. Event time T0 indicates the treatment year." ///
"All regressions include country and year fixed effects, and controls for business cycle," ///
"development level, and production structure variables. Standard errors are clustered by country." ///
"Significance levels: * p$<$0.10, ** p$<$0.05, *** p$<$0.01.")
// TABLE 1 Alternative: Summary coefficients only
esttab main_post_tax_top10 main_post_tax_top5 main_post_tax_top1 ///
using "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Table1_summary_w5.tex", ///
replace booktabs ///
b(4) se(4) star(* 0.10 ** 0.05 *** 0.01) ///
title("Effect of Dual Post_tax_income Taxation on Top Post_tax_income Shares - Summary\label{tab:main_summary}") ///
mtitles("Top 10\%" "Top 5\%" "Top 1\%") ///
keep(28.event2 29.event2 30.event2) ///
coeflabels(28.event2 "Treatment year (T0)" 29.event2 "Year after (T+1)" 30.event2 "Two years after (T+2)") ///
stats(N r2, fmt(0 3) labels("Observations" "R-squared")) ///
addnotes("Notes: This table reports key coefficient estimates from two-stage difference-in-differences regressions." ///
"The dependent variables are post_tax_income shares held by the top 10\%, 5\%, and 1\% of earners." ///
"Treatment is the introduction of dual post_tax_income taxation. All regressions include country and year" ///
"fixed effects, and controls for business cycle, development, and production structure variables." ///
"Standard errors are clustered by country. * p$<$0.10, ** p$<$0.05, *** p$<$0.01.")
*************************** EVENT STUDY FIGURES ****************************
******************** FIGURE 1: MAIN POST_TAX_RESULTS\ PLOTS **********************
display "================================================================="
display "CREATING FIGURE 1: EVENT STUDY PLOTS - PRIMARY OUTCOMES"
display "================================================================="
// Create publication-quality event study plots
foreach outcome in $primary_outcomes {
display "Creating event study plot for `outcome'"
coefplot main_`outcome', ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ///
ciopts(recast(rcap) lwidth(medium)) ///
msymbol(circle) msize(medium) mcolor(navy) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on `outcome' (percentage points)", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(off) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Figure1_`outcome'_event_study_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Figure1_`outcome'_event_study_w5.png", replace
}
// Combined event study plot for main paper
coefplot (main_post_tax_top10, label("Top 10%") msymbol(circle) mcolor(navy)) ///
(main_post_tax_top5, label("Top 5%") msymbol(triangle) mcolor(maroon)) ///
(main_post_tax_top1, label("Top 1%") msymbol(square) mcolor(forest_green)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on post_tax_income share (percentage points)", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Figure1_combined_event_study_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\Figure1_combined_event_study_w5.png", replace
*************************** EVENT STUDY FIGURES - SECONDARY OUTCOMES *********
******************** FIGURE A1: SECONDARY OUTCOMES PLOTS ******************
display "================================================================="
display "CREATING FIGURE A1: EVENT STUDY PLOTS - SECONDARY OUTCOMES"
display "================================================================="
// Individual plots for secondary outcomes
foreach outcome in $secondary_outcomes {
display "Creating event study plot for `outcome'"
coefplot main_`outcome', ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ///
ciopts(recast(rcap) lwidth(medium)) ///
msymbol(circle) msize(medium) mcolor(navy) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on `outcome' (percentage points)", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(off) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_`outcome'_event_study_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_`outcome'_event_study_w5.png", replace
}
// Combined plot for middle post_tax_income groups (Next 9%, Next 4%)
coefplot (main_post_tax_next9, label("Next 9% (P90-P99)") msymbol(circle) mcolor(navy)) ///
(main_post_tax_next4, label("Next 4% (P95-P99)") msymbol(triangle) mcolor(maroon)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on post_tax_income share (percentage points)", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_middle_groups_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_middle_groups_combined_w5.png", replace
// Combined plot for bottom post_tax_income groups
coefplot (main_post_tax_bottom50, label("Bottom 50%") msymbol(circle) mcolor(navy)) ///
(main_post_tax_bottom90, label("Bottom 90%") msymbol(triangle) mcolor(maroon)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on post_tax_income share (percentage points)", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_bottom_groups_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA1_bottom_groups_combined_w5.png", replace
*************************** EVENT STUDY FIGURES - RATIO OUTCOMES *************
******************** FIGURE A2: INEQUALITY RATIOS PLOTS ******************
display "================================================================="
display "CREATING FIGURE A2: EVENT STUDY PLOTS - INEQUALITY RATIOS"
display "================================================================="
// Individual plots for ratio outcomes
foreach outcome in $ratio_outcomes {
display "Creating event study plot for `outcome'"
coefplot main_`outcome', ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ///
ciopts(recast(rcap) lwidth(medium)) ///
msymbol(circle) msize(medium) mcolor(navy) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on `outcome'", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(off) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_`outcome'_event_study_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_`outcome'_event_study_w5.png", replace
}
// Combined plot for Top 10% ratios
coefplot (main_post_tax_ratio_t10_b50, label("Top10/Bottom50") msymbol(circle) mcolor(navy)) ///
(main_post_tax_ratio_t10_b90, label("Top10/Bottom90") msymbol(triangle) mcolor(maroon)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on inequality ratio", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_top10_ratios_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_top10_ratios_combined_w5.png", replace
// Combined plot for Top 1% ratios
coefplot (main_post_tax_ratio_t1_b50, label("Top1/Bottom50") msymbol(circle) mcolor(navy)) ///
(main_post_tax_ratio_t1_b90, label("Top1/Bottom90") msymbol(triangle) mcolor(maroon)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on inequality ratio", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_top1_ratios_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_top1_ratios_combined_w5.png", replace
// Combined plot for Next 9% and Next 4% ratios
coefplot (main_post_tax_ratio_n9_b50, label("Next9/Bottom50") msymbol(circle) mcolor(navy)) ///
(main_post_tax_ratio_n4_b50, label("Next4/Bottom50") msymbol(triangle) mcolor(maroon)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on inequality ratio", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(1) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_next_ratios_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_next_ratios_combined_w5.png", replace
// Grand combined plot for key inequality ratios
coefplot (main_post_tax_ratio_t10_b50, label("Top10/Bot50") msymbol(circle) mcolor(navy)) ///
(main_post_tax_ratio_t1_b50, label("Top1/Bot50") msymbol(triangle) mcolor(maroon)) ///
(main_post_tax_ratio_n9_b50, label("Next9/Bot50") msymbol(square) mcolor(forest_green)) ///
(main_post_tax_ratio_n4_b50, label("Next4/Bot50") msymbol(diamond) mcolor(orange)), ///
keep(*event2) vertical ///
yline(0, lcolor(black) lwidth(thin)) ///
xline(6, lcolor(red) lpattern(dash) lwidth(medium)) ///
levels(95) ciopts(recast(rcap)) ///
xlabel(1 "T-5" 2 "T-4" 3 "T-3" 4 "T-2" 5 "T-1" 6 "T0" 7 "T+1" 8 "T+2" 9 "T+3" 10 "T+4" 11 "T+5", ///
angle(0)) ///
ytitle("Effect on inequality ratio", size(medsmall)) ///
xtitle("Years relative to DIT introduction", size(medsmall)) ///
title("") ///
graphregion(color(white)) plotregion(color(white)) ///
legend(rows(2) size(small) position(12)) ///
note("95% confidence intervals. Standard errors clustered by country." ///
"Vertical line indicates treatment year. All ratios relative to bottom 50%.", size(vsmall))
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_all_ratios_combined_w5.pdf", replace
graph export "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\FigureA2_all_ratios_combined_w5.png", replace
*************************** ROBUSTNESS ANALYSIS *****************************
******************** TABLE 2: ROBUSTNESS TO CONTROLS ********************
display "================================================================="
display "ROBUSTNESS ANALYSIS - TABLE 2"
display "================================================================="
display "Purpose: Show main results robust to alternative control specifications"
display "Selected outcomes: Top 10%, Top 1%, Next 4%, Next9/Bottom90 ratio"
display "================================================================="
// Define robustness specifications
global spec1 "$controls_cycle $controls_development"
global spec2 "$controls_cycle $controls_production"
global spec3 "$controls_cycle $controls_capital"
global spec4 "$controls_cycle $controls_external"
global spec5 "$main_controls" // This is our main specification
global spec6 "$controls_cycle $controls_development $controls_capital"
global spec7 "$controls_cycle $controls_production $controls_capital"
global spec8 "$controls_cycle $controls_development $controls_external"
local spec_names `"Baseline" "Production" "Capital" "External" "Main" "Capital+" "Supply" "Open""'
// MODIFIED: Use specific robustness outcomes instead of primary outcomes
global robustness_outcomes "post_tax_top10 post_tax_top1 post_tax_next4 post_tax_ratio_n9_b90"
display "Selected outcomes for robustness: $robustness_outcomes"
// Store results for robustness table
tempname results_file
postfile `results_file' str30 outcome byte spec double effect double se double pvalue using robustness_results_w5, replace
foreach outcome in $robustness_outcomes {
display ""
display "=== ROBUSTNESS ANALYSIS: `outcome' ==="
forvalues s = 1/8 {
display "--- Specification `s': ${spec`s'} ---"
capture did2s `outcome', first_stage(i.country_id i.year ${spec`s'}) ///
second_stage(i.event2) treatment(tax_reform_dummy) cluster(country_id)
if _rc == 0 {
estimates store rob_`s'_`outcome'
// Calculate average post-treatment effect for robustness comparison
capture lincom (28.event2 + 29.event2 + 30.event2 + 31.event2 + 32.event2 + 33.event2)/6
if _rc == 0 {
local avg_effect = r(estimate)
local avg_se = r(se)
local avg_pval = r(p)
post `results_file' ("`outcome'") (`s') (`avg_effect') (`avg_se') (`avg_pval')
display "Average post-treatment effect: " %8.4f `avg_effect' " (se=" %6.4f `avg_se' " p=" %5.3f `avg_pval' ")"
}
else {
display "ERROR: Could not calculate average effect for specification `s'"
post `results_file' ("`outcome'") (`s') (.) (.) (.)
}
}
else {
display "ERROR: Regression failed for specification `s'"
post `results_file' ("`outcome'") (`s') (.) (.) (.)
}
}
}
postclose `results_file'
*************************** CREATE ROBUSTNESS TABLES IN TXT FORMAT ***************************
// First, check which estimates actually exist
foreach outcome in $robustness_outcomes {
display ""
display "=== Checking stored estimates for `outcome' ==="
forvalues s = 1/4 {
capture estimates describe rob_`s'_`outcome'
if _rc == 0 {
display "rob_`s'_`outcome': FOUND"
}
else {
display "rob_`s'_`outcome': NOT FOUND"
}
}
}
// Create individual robustness tables in TXT format (only for existing estimates)
foreach outcome in $robustness_outcomes {
display "Creating robustness table for `outcome'"
// Check if at least the first 4 estimates exist
local has_estimates = 0
forvalues s = 1/4 {
capture estimates describe rob_`s'_`outcome'
if _rc == 0 {
local has_estimates = 1
}
}
if `has_estimates' == 1 {
// Create a simple text table
capture file open robustness_file using "Table2_robustness_`outcome'_w5.txt", write replace
if _rc == 0 {
file write robustness_file "Robustness to Control Selection: `outcome'" _n
file write robustness_file "========================================" _n _n
file write robustness_file "Specification" _tab "T0 Coeff" _tab "T0 SE" _tab "T+1 Coeff" _tab "T+1 SE" _tab "T+2 Coeff" _tab "T+2 SE" _tab "N" _n
file write robustness_file "-------------" _tab "--------" _tab "-----" _tab "---------" _tab "------" _tab "---------" _tab "------" _tab "---" _n
forvalues s = 1/4 {
capture estimates restore rob_`s'_`outcome'
if _rc == 0 {
// Get coefficients and standard errors
local spec_name : word `s' of "Baseline" "Production" "Capital" "External"
capture local t0_coef = _b[28.event2]
capture local t0_se = _se[28.event2]
capture local t1_coef = _b[29.event2]
capture local t1_se = _se[29.event2]
capture local t2_coef = _b[30.event2]
capture local t2_se = _se[30.event2]
capture local obs = e(N)
// Handle missing values
if "`t0_coef'" == "" local t0_coef = "."
if "`t0_se'" == "" local t0_se = "."
if "`t1_coef'" == "" local t1_coef = "."
if "`t1_se'" == "" local t1_se = "."
if "`t2_coef'" == "" local t2_coef = "."
if "`t2_se'" == "" local t2_se = "."
if "`obs'" == "" local obs = "."
file write robustness_file "`spec_name'" _tab %8.4f (`t0_coef') _tab %6.4f (`t0_se') _tab %8.4f (`t1_coef') _tab %6.4f (`t1_se') _tab %8.4f (`t2_coef') _tab %6.4f (`t2_se') _tab %8.0f (`obs') _n
}
else {
local spec_name : word `s' of "Baseline" "Production" "Capital" "External"
file write robustness_file "`spec_name'" _tab "." _tab "." _tab "." _tab "." _tab "." _tab "." _tab "." _n
}
}
file write robustness_file _n "Notes:" _n
file write robustness_file "This table shows robustness of main results to alternative control specifications." _n
file write robustness_file "T0 = Treatment year, T+1 = Year after, T+2 = Two years after" _n
file write robustness_file "Baseline: business cycle + development." _n
file write robustness_file "Production: business cycle + production structure." _n
file write robustness_file "Capital: business cycle + capital markets." _n
file write robustness_file "External: business cycle + external balance." _n
file write robustness_file "Standard errors clustered by country." _n
file close robustness_file
display "Created: Table2_robustness_`outcome'_w5.txt"
}
else {
display "ERROR: Could not create file for `outcome'"
}
}
else {
display "SKIPPED: No valid estimates found for `outcome'"
}
}
*************************** COEFFICIENT STABILITY TABLE IN TXT FORMAT ***************************
use robustness_results_w5, clear
// Export to TXT format instead of Excel
outsheet using "coefficient_stability_selected_outcomes_w5.txt", replace
// Create a formatted summary table
file open summary_file using "coefficient_stability_summary_w5.txt", write replace
file write summary_file "COEFFICIENT STABILITY RESULTS - SELECTED OUTCOMES" _n
file write summary_file "=================================================" _n _n
file write summary_file "Outcome" _tab "Spec1" _tab "Spec2" _tab "Spec3" _tab "Spec4" _tab "Spec5" _tab "Mean" _tab "StdDev" _tab "CV" _n
file write summary_file "-------" _tab "-----" _tab "-----" _tab "-----" _tab "-----" _tab "-----" _tab "----" _tab "------" _tab "--" _n
foreach outcome in post_tax_top10 post_tax_top1 post_tax_next4 post_tax_ratio_n9_b90 {
preserve
keep if outcome == "`outcome'"
if _N > 0 {
// Initialize locals with missing values
local eff1 = .
local eff2 = .
local eff3 = .
local eff4 = .
local eff5 = .
// Get effects for each specification (proper Stata syntax)
forvalues s = 1/5 {
capture local temp_eff = effect[`s'] if spec[`s'] == `s'
if _rc == 0 & !missing(effect[`s']) {
local eff`s' = effect[`s']
}
}
// Alternative method: loop through observations
forvalues i = 1/`=_N' {
if spec[`i'] == 1 & !missing(effect[`i']) {
local eff1 = effect[`i']
}
if spec[`i'] == 2 & !missing(effect[`i']) {
local eff2 = effect[`i']
}
if spec[`i'] == 3 & !missing(effect[`i']) {
local eff3 = effect[`i']
}
if spec[`i'] == 4 & !missing(effect[`i']) {
local eff4 = effect[`i']
}
if spec[`i'] == 5 & !missing(effect[`i']) {
local eff5 = effect[`i']
}
}
// Calculate summary statistics
summarize effect if !missing(effect), detail
if r(N) > 0 {
local mean_val = r(mean)
local sd_val = r(sd)
// Calculate coefficient of variation
if `mean_val' != 0 {
local cv_val = abs(`sd_val'/`mean_val')
}
else {
local cv_val = .
}
}
else {
local mean_val = .
local sd_val = .
local cv_val = .
}
// Write to file (handle missing values)
if `eff1' == . local eff1_str = "."
else local eff1_str = string(`eff1', "%6.4f")
if `eff2' == . local eff2_str = "."
else local eff2_str = string(`eff2', "%6.4f")
if `eff3' == . local eff3_str = "."
else local eff3_str = string(`eff3', "%6.4f")
if `eff4' == . local eff4_str = "."
else local eff4_str = string(`eff4', "%6.4f")
if `eff5' == . local eff5_str = "."
else local eff5_str = string(`eff5', "%6.4f")
if `mean_val' == . local mean_str = "."
else local mean_str = string(`mean_val', "%6.4f")
if `sd_val' == . local sd_str = "."
else local sd_str = string(`sd_val', "%6.4f")
if `cv_val' == . local cv_str = "."
else local cv_str = string(`cv_val', "%6.4f")
file write summary_file "`outcome'" _tab "`eff1_str'" _tab "`eff2_str'" _tab "`eff3_str'" _tab "`eff4_str'" _tab "`eff5_str'" _tab "`mean_str'" _tab "`sd_str'" _tab "`cv_str'" _n
// Display interpretation
display ""
display "--- `outcome' ---"
display "Mean effect: `mean_str'"
display "Std deviation: `sd_str'"
display "Coefficient of variation: `cv_str'"
if `cv_val' != . {
if `cv_val' < 0.1 {
display "INTERPRETATION: Highly robust (CV < 0.1)"
}
else if `cv_val' < 0.3 {
display "INTERPRETATION: Moderately robust (CV < 0.3)"
}
else {
display "INTERPRETATION: Sensitive to controls (CV >= 0.3)"
}
}
}
restore
}
file write summary_file _n "Notes:" _n
file write summary_file "Spec1=Baseline, Spec2=Production, Spec3=Capital, Spec4=External, Spec5=Main" _n
file write summary_file "CV = Coefficient of Variation (StdDev/Mean)" _n
file write summary_file "CV < 0.1: Highly robust" _n
file write summary_file "CV < 0.3: Moderately robust" _n
file write summary_file "CV >= 0.3: Sensitive to controls" _n
file close summary_file
display ""
display "================================================================="
display "FILES CREATED (TXT FORMAT):"
display "- robustness_results_w5.dta: Raw robustness results"
display "- coefficient_stability_selected_outcomes_w5.txt: CSV format"
display "- coefficient_stability_summary_w5.txt: Formatted summary"
display "- Table2_robustness_[outcome]_w5.txt: Individual outcome tables"
display "================================================================="
*************************** ROBUSTNESS STATISTICS FOR ACADEMIC DISCUSSION ****************************
display "================================================================="
display "ROBUSTNESS STATISTICS FOR ACADEMIC DISCUSSION"
display "================================================================="
use robustness_results_w5, clear
foreach outcome in post_tax_top10 post_tax_top1 post_tax_next4 post_tax_ratio_n9_b90 {
display ""
display "--- Robustness Statistics: `outcome' ---"
preserve
keep if outcome == "`outcome'"
drop if missing(effect)
if _N > 0 {
summarize effect, detail
local mean_effect = r(mean)
local min_effect = r(min)
local max_effect = r(max)
local sd_effect = r(sd)
local range_effect = r(max) - r(min)
display "Mean effect across specifications: " %8.4f `mean_effect'
display "Standard deviation across specs: " %8.4f `sd_effect'
display "Range (max - min): " %8.4f `range_effect'
if `mean_effect' != 0 {
local cv_effect = abs(`sd_effect'/`mean_effect')
display "Coefficient of variation: " %8.4f `cv_effect'
// Academic interpretation
if `cv_effect' < 0.1 {
display "INTERPRETATION: Highly robust (CV < 0.1)"
display "TEXT: 'Results are highly robust to control specification (CV = " %5.3f `cv_effect' ")'"
}
else if `cv_effect' < 0.3 {
display "INTERPRETATION: Moderately robust (CV < 0.3)"
display "TEXT: 'Results show moderate robustness to control specification (CV = " %5.3f `cv_effect' ")'"
}
else {
display "INTERPRETATION: Sensitive to controls (CV >= 0.3)"
display "TEXT: 'Results show some sensitivity to control specification (CV = " %5.3f `cv_effect' ")'"
}
}
// Show range of estimates for academic text
display ""
display "FOR ACADEMIC TEXT:"
display "The estimated effect ranges from " %6.4f `min_effect' " to " %6.4f `max_effect' " percentage points"
display "across different control specifications (mean = " %6.4f `mean_effect' " pp)."
}
restore
}
log close
// Clean up and export robustness data
use robustness_results_w5, clear
export delimited using "C:\Users\Michele\Desktop\DIT paper\Post_Tax_Results\Relaxed\DID2S\window_5\robustness_summary_academic_w5.csv", replace
display "================================================================="
display "ACADEMIC ANALYSIS COMPLETED"
display "All tables and figures ready for academic publication"
display "All files saved to: window_5 subfolder with w5 suffix"
display "================================================================="
import os
import time
import PyPDF2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def analyze_pdf(file_path, keywords):
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
found_words = []
for page_num, page in enumerate(reader.pages, 1):
text = page.extract_text()
lines = text.split('\n')
for keyword in keywords:
keyword_lower = keyword#.lower()
for line_num, line in enumerate(lines, 1):
if keyword_lower in line:#.lower():
position = f"Page {page_num}, Line {line_num}"
found_words.append((keyword, position))
return found_words
# Setup the webdriver
options = webdriver.ChromeOptions()
download_dir = "C:\\Users\\Michele\\Desktop\\Oxford\\File Prezzi Beni Agricoli"
prefs = {"download.default_directory": download_dir}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)
url = "https://www.gazzettaufficiale.it/ricerca/pdf/preRsi/foglio_ordinario1/1/0/0?reset=true"
year = 1868
# Keywords to search for in PDFs
keywords = ["TAB"]
# wait 5 seconds
time.sleep(5)
# Navigate to the page
driver.get(url)
# Wait for and select the year
year_dropdown = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "annoPubblicazione"))
)
Select(year_dropdown).select_by_value(str(year))
# Find and click the "Cerca" button
cerca_button = driver.find_element(By.XPATH, "//input[@type='submit' and @value='Cerca']")
cerca_button.click()
# Wait for the results page to load
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "elenco_pdf"))
)
# Find all download links
download_links = driver.find_elements(By.XPATH, "//a[contains(@class, 'download_pdf')]")
# create a folder to store the log file, called with the current date and time
import datetime
now = datetime.datetime.now()
log_folder = now.strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(log_folder, exist_ok=True)
# Process each PDF
for i, link in enumerate(download_links):
# Click download link
link.click()
time.sleep(5) # Wait for download to complete
# Get the name of the most recently downloaded file
list_of_files = os.listdir(download_dir)
latest_file = max([os.path.join(download_dir, f) for f in list_of_files], key=os.path.getctime)
print(f"Processing {latest_file}")
# Analyze the PDF
found_words = analyze_pdf(latest_file, keywords)
print(f"Found {len(found_words)} keywords")
if found_words:
# Keep the file and log the findings
log_file_name = "pdf_analysis_log.txt"
log_file_path = os.path.join(log_folder, log_file_name)
with open(log_file_path, 'a') as log_file:
log_file.write(f"File: {latest_file}\n")
for word, location in found_words:
log_file.write(f"Found keyword '{word}' at {location}\n")
log_file.write("\n\n")
else:
# Delete the file if no keywords found
os.remove(latest_file)
print(f"Deleted {latest_file}")
print(f"Processed file {i+1} of {len(download_links)}")
print()
if i == 30:
break
#print("Waiting for 2 seconds")
#time.sleep(2)
print("All PDFs have been processed.")
# Close the browser
driver.quit()