
/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			clean_refxp.do
Description: 	Cleaning refs data from UCL, World Cup, Confederations Cup, Copa Libertadores, and the Asian Cup,
				then merging the data with the dta final_treatment_wrefs and saving the result as a new .dta called
				final_treatment_wrefs_experience

Notes: 			- Created by Camilo 
				- Last updated 9/15/2022
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Set up environment ------------------------------------------------------------------------------
clear
set more off
version 16.0

*set scheme plotplainblind

*set processors 24
*set max_memory 115g

* Project root: add your own file path below with cap in front.
* Each cd is wrapped in capture, so the paths that do not exist on this machine are skipped
* silently and the working directory ends up at whichever one exists.
cap cd "C:/Users/jcross/Dropbox/HFA and VAR"
cap cd "C:\Users\uhrig_R\Dropbox\HFA and VAR"
cap cd "C:\Users\richa\Dropbox\HFA and VAR"
cap cd "C:\Users\camel\Dropbox\HFA and VAR"

* Data directories, relative to the project root.
* All of them use forward slashes so the same strings work on every platform
* (refxpdata previously used a backslash, unlike the other globals).
global rawdata "1_data/0_raw"
global cleandata "1_data/1_clean"
global finaldata "1_data/2_final"
global refxpdata "1_data/4_refxp"

* Results
global graphs "4_results/Figures"
global regressions "4_results/Tables"

* Load the referee data scraped by Richard for the UCL, World Cup, Confederations Cup,
* Copa Libertadores and Asian Cup.
use "${refxpdata}/refxp.dta", clear

* Keep only the rows that actually name a referee.
keep if referee != ""

* Build referee_upper from referee in three steps:
*   1. start from a copy of the raw name,
*   2. turn every period into a space,
*   3. collapse repeated internal blanks, trim both ends and upper-case the result.
gen referee_upper = referee
replace referee_upper = subinstr(referee_upper, ".", " ", .)
replace referee_upper = upper(strtrim(stritrim(referee_upper)))

* Name fixes, Asian Cup ("afc") ---------------------------------------------------------------------
* Each replace below targets one referee's full (mis-encoded or punctuated) name and rewrites
* it to a plain ASCII spelling. subinstr() with "." as last argument replaces every occurrence.
* None of the replace statements has an if qualifier, so they act on all rows, not only
* on rows with country == "afc"; the tab commands before and after are only for inspection.
* NOTE(review): the mis-encoded literals may contain non-printing characters that are not
* visible in an editor -- retype them only after checking the raw bytes.

*afc:
tab referee_upper if country == "afc"

replace referee_upper = subinstr(referee_upper, "CÃ©SAR ARTURO RAMOS" , "CESAR ARTURO RAMOS",.)
replace referee_upper = subinstr(referee_upper, "IL'GIZ TANTASHEV" , "ILGIZ TANTASHEV",.)
replace referee_upper = subinstr(referee_upper, "PETER O'LEARY" , "PETER OLEARY",.)
replace referee_upper = subinstr(referee_upper, "RYÅ«JI SATÅ" , "RYUJI SATO",.)

* Inspect the result of the fixes above.
tab referee_upper if country == "afc"


* Name fixes, Copa Libertadores ("clib") -----------------------------------------------------------
tab referee_upper if country == "clib"
* For clib it makes more sense to replace recurring mis-encoded character sequences than
* whole names. The replace statements have no if qualifier, so they act on all rows.
*
* Each line nests three subinstr() calls; the innermost one is applied first.
* The order of these lines matters:
*   - the 2nd line maps "Ãº" to "A" and "Ã±" to the empty string, so when the 5th line
*     runs, its "Ã±" -> "N" and "Ãº" -> "U" replacements have nothing left to match;
*   - on the 4th line the bare "Ã" -> "A" replacement is applied before "Ã³" -> "O",
*     so "Ã³" has already become "A³" by then; later statements in this file match on
*     such "A"+symbol leftovers (e.g. "A©", "A¶", "A³").
* NOTE(review): whole-name fixes further down (e.g. "RAAL OROZCO", "SAAL LAVERNI",
* "JESAS GIL") presumably clean up the output of the "Ãº" -> "A" mapping. Correcting the
* mapping here would require revisiting those lines -- confirm before changing anything.
* NOTE(review): some literals may contain non-printing characters; check the raw bytes.
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã¡","A",.),"Ã©","E",.),"Ã­","I",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ãº","A",.),"Ã±","",.),"î","I",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"ô","O",.),"ï","I",.),"Ã¡","A",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"î","I",.),"Ã","A",.),"Ã³","O",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã©","E",.),"Ã±","N",.),"Ãº","U",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"Ã­","I",.),"Ö","O",.),"A©","E",.)
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"ı","I",.),"Ş","S",.),"ş","S",.)

* Inspect the result of the character-level fixes.
tab referee_upper if country == "clib"

* Whole-name fixes for what the character-level pass left behind.
* NOTE(review): the first and last statements look like no-ops as displayed (same text on
* both sides); presumably the search string holds a non-printing character -- confirm.
replace referee_upper = subinstr(referee_upper, "ANGEL ARTEAGA" , "ANGEL ARTEAGA",.)
replace referee_upper = subinstr(referee_upper, "AMER MACHADO" , "IMER MACHADO",.)
replace referee_upper = subinstr(referee_upper, "ASCAR MALDONADO" , "OSCAR MALDONADO",.)
replace referee_upper = subinstr(referee_upper, "ASCAR ROJAS" , "OSCAR ROJAS",.)
replace referee_upper = subinstr(referee_upper, "JHON ALVAREZ" , "JHON ALVAREZ",.)


* Name fixes, Confederations Cup ("confc") ---------------------------------------------------------
tab referee_upper if country == "confc"
* Character-level pass: "A¶" -> "O" first, then "A§" -> "Z". Applied to all rows (no if).
replace referee_upper = subinstr(subinstr(referee_upper,"A¶","O",.),"A§","Z",.)

* Whole-name fixes for two referees whose names are still mis-encoded at this point.
* NOTE(review): the literals may contain non-printing characters; check the raw bytes.
replace referee_upper = subinstr(referee_upper, "DJAMEL HAA¯MOUDI" , "DJAMEL HAIMOUDI",.)
replace referee_upper = subinstr(referee_upper, "MILORAD MAÅ¾IÄ" , "MILORAD MAZIC",.)

* Inspect the result of the fixes above.
tab referee_upper if country == "confc"


* Name fixes, Champions League ("ucl") -------------------------------------------------------------
tab referee_upper if country == "ucl"
* Character-level pass, innermost first: "A®" -> "I", "Å¾" -> "Z", "A©" -> "E".
* Like every replace in this section it has no if qualifier and acts on all rows.
replace referee_upper = subinstr(subinstr(subinstr(referee_upper,"A®","I",.),"Å¾","Z",.),"A©","E",.)

* Whole-name fixes. These are substring replacements, so a short form that is contained in
* an already complete name is expanded again (see the de-duplication near the end).
* NOTE(review): the mis-encoded literals may contain non-printing characters; check the
* raw bytes before retyping any of them.
replace referee_upper = subinstr(referee_upper, "ALIAKSEI KULBAKOU" , "ALIAKSEI KULBAKOV",.)

replace referee_upper = subinstr(referee_upper, "ARTUR DIAS" , "ARTUR SOARES DIAS",.)

replace referee_upper = subinstr(referee_upper, "BARIS ÅIMÅEK" , "BARIS SIMSEK",.)

* Two differently mis-encoded spellings are mapped to the same target name.
replace referee_upper = subinstr(referee_upper, "CA¼NEYT AAKÄ±R" , "CUNEYT ZAKIR",.)
replace referee_upper = subinstr(referee_upper, "CA¼NEYT AAKÎ¹R" , "CUNEYT ZAKIR",.)

replace referee_upper = subinstr(referee_upper, "DANILO GRUJIÄ" , "DANILO GRUJIC",.)

replace referee_upper = subinstr(referee_upper, "FÄ±RAT AYDÄ±NUS" , "FIRAT AYDINUS",.)

* Prepends the first name; names that already start with ANTONIO get it twice and are
* repaired by the last replace of this section.
replace referee_upper = subinstr(referee_upper, "MATEU LAHOZ" , "ANTONIO MATEU LAHOZ",.)

* Two differently mis-encoded spellings are mapped to the same target name.
replace referee_upper = subinstr(referee_upper, "OVIDIU HAÅ£EGAN" , "OVIDIU HATEGAN",.)
replace referee_upper = subinstr(referee_upper, "OVIDIU HAÈEGAN" , "OVIDIU HATEGAN",.)

* Only the first name is matched here, so every name containing it is affected.
replace referee_upper = subinstr(referee_upper, "PAWEÅ" , "PAWEL",.)

* NOTE(review): this rewrites one name into a completely different name rather than
* fixing an encoding -- confirm that this substitution is intended.
replace referee_upper = subinstr(referee_upper, "SANDOR ANDA³-SZABA³" , "VIKTOR KASSAI",.)
 
replace referee_upper = subinstr(referee_upper, "SERGEI KARASEV" , "SERGEY KARASEV",.)

* Original author's warning: all 4 lines below are needed -- DO NOT TOUCH.
* NOTE(review): as displayed, lines 1-2 are identical and lines 3-4 replace a string with
* itself. Presumably the search strings differ in non-printing bytes -- confirm with a
* byte-level view before editing or removing any of them.
replace referee_upper = subinstr(referee_upper, "SLAVKO VINÄIÄ" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINÄIÄ" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINCIC" , "SLAVKO VINCIC",.)
replace referee_upper = subinstr(referee_upper, "SLAVKO VINCIC" , "SLAVKO VINCIC",.)

replace referee_upper = subinstr(referee_upper, "TOMASZ MUSIAÅ" , "TOMASZ MUSIAL",.)

* A couple that were missed above:
replace referee_upper = subinstr(referee_upper, "MANUEL GRA¤FE" , "MANUEL GRAFE",.) 

* Undo the doubled first name produced by the "MATEU LAHOZ" replacement above.
replace referee_upper = subinstr(referee_upper, "ANTONIO ANTONIO MATEU LAHOZ" , "ANTONIO MATEU LAHOZ",.)

* Inspect the result of the fixes above.
tab referee_upper if country == "ucl"

* Name harmonisation added after inspecting the merge ----------------------------------------------
* World Cup ("wc") names are only inspected here; no wc-specific replacement follows.
tab referee_upper if country == "wc"

* These replacements were added after a first run of the merge further down, to make the
* tournament names agree with the names used in final_treatment_wrefs, and to unify
* duplicated spellings of the same referee in case they are ever needed.
* All of them are substring replacements on every row, and they are order-dependent:
* several expand a short form into a full name, which doubles part of the name when a longer
* form was already present; a later line then removes the doubled part.
* NOTE(review): the cascades are written for the short forms. A name that already has the
* full target spelling before this section may end up with a repeated word -- check the tab
* output of referee_upper after running.


replace referee_upper = subinstr(referee_upper, "PAULO CESAR DE OLIVEIRA" , "PAULO CESAR OLIVEIRA",.)

* Two spellings mapped to the same target.
replace referee_upper = subinstr(referee_upper, "RAAL OROSCO" , "RAUL OROZCO",.)
replace referee_upper = subinstr(referee_upper, "RAAL OROZCO" , "RAUL OROZCO",.)

replace referee_upper = subinstr(referee_upper, "SAAL LAVERNI" , "SAUL LAVERNI",.)

replace referee_upper = subinstr(referee_upper, "WILTON SAMPAIO" , "WILTON PEREIRA SAMPAIO",.)

replace referee_upper = subinstr(referee_upper, "CARLOS DEL CERRO" , "CARLOS DEL CERRO GRANDE",.)

* First half of the VELASCO CARBALLO cascade (continued below).
replace referee_upper = subinstr(referee_upper, "CARLOS VELASCO" , "CARLOS VELASCO CARBALLO",.)


* FERNANDEZ BORBALAN cascade: three short forms are expanded to the same full name.
* NOTE(review): the third line also maps the first name DIEGO to DAVID -- confirm intended.
replace referee_upper = subinstr(referee_upper, "FERNANDEZ BORBALAN" , "DAVID FERNANDEZ BORBALAN",.)
replace referee_upper = subinstr(referee_upper, "DAVID FERNANDEZ" , "DAVID FERNANDEZ BORBALAN",.)
replace referee_upper = subinstr(referee_upper, "DIEGO FERNANDEZ" , "DAVID FERNANDEZ BORBALAN",.)

* Remove the doubled surname that the second replacement above adds to names which the
* first replacement had already completed.
replace referee_upper = subinstr(referee_upper, "DAVID FERNANDEZ BORBALAN BORBALAN" , "DAVID FERNANDEZ BORBALAN",.)

replace referee_upper = subinstr(referee_upper, "JESAS GIL" , "JESUS GIL MANZANO",.)

replace referee_upper = subinstr(referee_upper, "JORGE DE SOUSA" , "JORGE SOUSA",.)

replace referee_upper = subinstr(referee_upper, "RICARDO DE BURGOS" , "RICARDO DE BURGOS BENGOETXEA",.)

replace referee_upper = subinstr(referee_upper, "UNDIANO MALLENCO" , "ALBERTO UNDIANO",.)

* Second half of the VELASCO CARBALLO cascade: expands the surname-only form, which doubles
* the first name on rows already expanded by the "CARLOS VELASCO" replacement above.
replace referee_upper = subinstr(referee_upper, "VELASCO CARBALLO" , "CARLOS VELASCO CARBALLO",.)

* Remove that doubled first name.
replace referee_upper = subinstr(referee_upper, "CARLOS CARLOS VELASCO CARBALLO" , "CARLOS VELASCO CARBALLO",.)

* Build the referee-level table used in the merge --------------------------------------------------
* The names are now ready to be merged. The only information needed from this dataset is in
* which season each referee name first appears in any international tournament, so keep just
* the name, the tournament and the season. Tournament and season get new names
* (country_2, season_2) so they stay distinct from the variables country and season.
gen country_2 = country

* season is a string in this dataset; real() converts it to a number
* (values that are not numeric become missing).
gen season_2 = real(season)

* Only these columns are needed from here on.
keep referee_upper country_2 season_2

* A referee can appear in several tournaments and seasons (for example Felix Brych appears in
* ucl, wc and confc). Keep one row per referee: the earliest season in any competition.
* Missing seasons sort last, so they are kept only when a referee has no valid season at all.
* country_2 is added as a tie-breaker: Stata does not order tied observations in a
* reproducible way, so without it the tournament kept for a referee who first appears in two
* tournaments in the same season could change from run to run.
bysort referee_upper (season_2 country_2): keep if _n == 1

* season_2 now holds the first international season of each referee.
rename season_2 international_first
label variable country_2 "international tournament"

* Inspect the final list of referees (one row each).
tab referee_upper

* Merge referee experience onto the match-level data ----------------------------------------------
* Master (in memory): one row per referee_upper, with country_2 and international_first.
* Using: final_treatment_wrefs, which has many rows per referee.
* The file is addressed through the finaldata global, relative to the project root chosen by
* the "cap cd" lines at the top of this script, instead of changing into one user's
* absolute Dropbox path. The working directory therefore stays at the project root.
merge 1:m referee_upper using "${finaldata}/final_treatment_wrefs", force generate(last_merge)

* CHECKING IF THE MERGE WAS SUCCESSFUL:
* last_merge == 1 marks referees from the tournament data that were not found in
* final_treatment_wrefs. List them overall and then tournament by tournament.
tab referee_upper country_2 if last_merge == 1
tab referee_upper if last_merge == 1 & country_2 == "clib"
tab referee_upper if last_merge == 1 & country_2 == "confc"
tab referee_upper if last_merge == 1 & country_2 == "ucl"

* The match looks reliable, so drop the referees that do not appear in final_treatment_wrefs.
drop if last_merge == 1


* Referee experience dummy --------------------------------------------------------------------------
* First put both season variables on a common numeric scale. A value whose string form is
* longer than 2 characters is kept as it is; a shorter one (e.g. 18) becomes
* value*100 + value (1818). Missing values stay missing.
* NOTE(review): presumably short values are single-year seasons and long values are
* two-year seasons written like 1819 -- confirm against the data.
gen international_since = cond(strlen(string(international_first)) > 2, ///
	international_first, international_first*100 + international_first)
gen season_comparison = cond(strlen(string(season)) > 2, season, season*100 + season)

* experienced_ref is 1 when the referee's first international season is known and is not
* later than the season of the match, and 0 otherwise (including referees who never
* appear in the tournament data).
gen experienced_ref = (international_since != . & international_since <= season_comparison)
label variable experienced_ref   "1 if referee was international in that season or before"

* To eyeball the result:
* br referee_upper season season_comparison international_since international_first experienced_ref


* Referee covariates for the regressions -----------------------------------------------------------
* Variables about refs that might be good to include:
*   - apps: appearances over the whole season (use apps_overall). Indicates experience;
*     expected to correlate negatively with goal_diff.
*   - pen_pg_home and pen_pg_away: penalties per game. Their difference probably correlates
*     positively with goal_diff. Idea of ref bias?
*   - red_pg_home - red_pg_away might also serve as a proxy of ref bias; try the same with
*     the yellow-card home/away difference.
*   - experienced_ref: dummy for refs with international experience.

* Home-minus-away differences to use in the regressions:
gen ref_pen_dif = pen_pg_home - pen_pg_away
label variable ref_pen_dif   "Diff penalties given by ref per game, home - away"

gen ref_yellow_dif = yel_pg_home - yel_pg_away
label variable ref_yellow_dif   "Diff yellows given by ref per game, home - away"

gen ref_red_dif = red_pg_home - red_pg_away
label variable ref_red_dif   "Diff reds given by ref per game, home - away"

* Interaction of VAR and referee experience:
gen var_ref = VAR*experienced_ref
label variable var_ref   "dummy interaction VAR and experienced referee"


* Data ready to be saved and used in regressions ---------------------------------------------------
* Saved under the finaldata folder of the project root; replace lets the script be re-run
* without failing because the output file already exists.
save "${finaldata}/final_treatment_wrefs_experience.dta", replace


