
/* ////////////////////////////////////////////////////////////////////////////////////////////////
Name: 			camilo_refs.do
Description: 	Matching refs with games, from the ref data in whoscored.com 

Notes: 			- Created by Camilo 
				- Last updated 9/7/2022
//////////////////////////////////////////////////////////////////////////////////////////////// */

* Environment ---------------------------------------------------------------------------------------
* Start from an empty session, disable output paging, and pin the interpreter version.
clear
set more off
version 16.0

* Optional settings, currently disabled:
*set scheme plotplainblind
*set processors 24
*set max_memory 115g

* Project root -------------------------------------------------------------------------------------
* Each collaborator adds their own Dropbox location to this list. Every candidate is tried in
* order under -capture-, so folders that do not exist on this machine are skipped silently and
* the last folder that does exist becomes the working directory.
foreach candidate_root in "C:/Users/jcross/Dropbox/HFA and VAR"   ///
                          "C:\Users\uhrig_R\Dropbox\HFA and VAR"  ///
                          "C:\Users\richa\Dropbox\HFA and VAR"    ///
                          "C:\Users\camel\Dropbox\HFA and VAR" {
	capture cd "`candidate_root'"
}

* Folder shortcuts (all relative to the project root) ----------------------------------------------
* Data: raw inputs, cleaned intermediates, and final analysis files
global rawdata   "1_data/0_raw"
global cleandata "1_data/1_clean"
global finaldata "1_data/2_final"

* Results: figures and regression tables
global graphs      "4_results/Figures"
global regressions "4_results/Tables"

* Load the match-level dataset that the referee names will be cleaned in ---------------------------
use "${finaldata}/final_treatment.dta", clear

	
* Build referee_upper: a cleaned copy of -referee- used as the merge key.
* Step 1: turn periods into spaces, collapse repeated internal blanks, trim the ends, and
*         upper-case the result.
gen referee_upper = referee
replace referee_upper = upper(strtrim(stritrim(subinstr(referee_upper, ".", " ", .))))

* Step 2: map accented / non-ASCII characters to plain ASCII capitals.
* upper() leaves accented letters in lower case (e.g. SRđAN JOVANOVIć, SéBASTIEN DESIAGE),
* and some names arrive with mis-encoded two-character sequences starting with "Ã".
* The replacements below are applied one at a time and THE ORDER MATTERS: the bare "Ã" -> "A"
* rule runs part-way through, and the later "A?" rules pick up what it leaves behind.
* Do not reorder or merge these lines without re-checking the merge results.

* -- accented lower-case vowels and cedilla
replace referee_upper = subinstr(referee_upper, "á", "A", .)
replace referee_upper = subinstr(referee_upper, "é", "E", .)
replace referee_upper = subinstr(referee_upper, "í", "I", .)
replace referee_upper = subinstr(referee_upper, "ó", "O", .)
replace referee_upper = subinstr(referee_upper, "ú", "U", .)
replace referee_upper = subinstr(referee_upper, "ä", "A", .)
replace referee_upper = subinstr(referee_upper, "ö", "O", .)
replace referee_upper = subinstr(referee_upper, "ü", "U", .)
replace referee_upper = subinstr(referee_upper, "ç", "Z", .)
replace referee_upper = subinstr(referee_upper, "ã", "A", .)
replace referee_upper = subinstr(referee_upper, "ê", "E", .)

* -- mis-encoded sequences handled before the bare "Ã" rule, plus more accented vowels
replace referee_upper = subinstr(referee_upper, "Ã¼", "U", .)
replace referee_upper = subinstr(referee_upper, "Ã¤", "A", .)
replace referee_upper = subinstr(referee_upper, "Ã¶", "O", .)
replace referee_upper = subinstr(referee_upper, "î", "I", .)
replace referee_upper = subinstr(referee_upper, "ô", "O", .)
replace referee_upper = subinstr(referee_upper, "ï", "I", .)
replace referee_upper = subinstr(referee_upper, "Ã¡", "A", .)
replace referee_upper = subinstr(referee_upper, "î", "I", .)

* -- bare "Ã" -> "A"; from here on any remaining "Ã?" pair has become "A?"
replace referee_upper = subinstr(referee_upper, "Ã", "A", .)
replace referee_upper = subinstr(referee_upper, "Ã³", "O", .)
replace referee_upper = subinstr(referee_upper, "Ã©", "E", .)
replace referee_upper = subinstr(referee_upper, "Ã±", "N", .)
replace referee_upper = subinstr(referee_upper, "Ãº", "U", .)
* NOTE(review): the pattern on the next line appears to end in an invisible character
* (likely a soft hyphen) - keep it byte-for-byte when editing.
replace referee_upper = subinstr(referee_upper, "Ã­", "I", .)
replace referee_upper = subinstr(referee_upper, "Ö", "O", .)
replace referee_upper = subinstr(referee_upper, "A©", "E", .)

* -- Turkish letters and a Greek iota
replace referee_upper = subinstr(referee_upper, "ı", "I", .)
replace referee_upper = subinstr(referee_upper, "Ş", "S", .)
replace referee_upper = subinstr(referee_upper, "ş", "S", .)
replace referee_upper = subinstr(referee_upper, "ğ", "G", .)
replace referee_upper = subinstr(referee_upper, "Ç", "Z", .)
replace referee_upper = subinstr(referee_upper, "ι", "I", .)

* -- leftovers of the mis-encoded pairs after "Ã" became "A"
replace referee_upper = subinstr(referee_upper, "A³", "O", .)
replace referee_upper = subinstr(referee_upper, "A©", "E", .)
replace referee_upper = subinstr(referee_upper, "A±", "N", .)
replace referee_upper = subinstr(referee_upper, "Aº", "U", .)
* NOTE(review): as above, this pattern appears to end in an invisible character.
replace referee_upper = subinstr(referee_upper, "A­", "I", .)


* Unify individual referees ------------------------------------------------------------------------
* Rewrite the spelling found in the match data to the spelling used in the referee file, so
* that referee_upper can serve as a merge key.
*
* Most rules use subinstr(), which replaces the text ANYWHERE inside the name. When a rule
* appends a surname to a name that already carries it, the surname is doubled; the
* "undo doubling" lines at the end of a country section repair the known cases.

* france
replace referee_upper = subinstr(referee_upper, "FRANCK SCHNEIDER" , "FRANK SCHNEIDER",.)
replace referee_upper = subinstr(referee_upper, "ALEXANDRE PERREAU-NIEL" , "ALEXANDRE PERREAU NIEL",.)
replace referee_upper = subinstr(referee_upper, "PHILLIPE MALIGE" , "PHILIPPE MALIGE",.)
replace referee_upper = subinstr(referee_upper, "HAKIM BEN EL HADJ" , "HAKIM BEN EL HADJ SALEM",.)

* undo doubling: names that already ended in SALEM got a second SALEM from the rule above
replace referee_upper = subinstr(referee_upper, "HAKIM BEN EL HADJ SALEM SALEM" , "HAKIM BEN EL HADJ SALEM",.)


* spain
* NOTE(review): the next rule replaces the name with itself and therefore changes nothing;
* the intended full name was presumably never filled in - confirm and complete or delete.
replace referee_upper = subinstr(referee_upper, "ALFONSO ALVAREZ" , "ALFONSO ALVAREZ",.)
replace referee_upper = subinstr(referee_upper, "ALEJANDRO FERNANDEZ" , "ALEJANDRO HERNANDEZ",.)
replace referee_upper = subinstr(referee_upper, "CARLOS DEL CERRO" , "CARLOS DEL CERRO GRANDE",.)
replace referee_upper = subinstr(referee_upper, "CARLOS VELASCO" , "CARLOS VELASCO CARBALLO",.)
replace referee_upper = subinstr(referee_upper, "DAVID FERNANDEZ" , "DAVID FERNANDEZ BORBALAN",.)
replace referee_upper = subinstr(referee_upper, "DANIEL TRUJILLO" , "DANIEL JESUS TRUJILLO SUAREZ",.)
replace referee_upper = subinstr(referee_upper, "EDUARDO PRIETO" , "EDUARDO PRIETO IGLESIAS",.)
replace referee_upper = subinstr(referee_upper, "FERNANDO TEIXEIRA" , "FERNANDO TEIXEIRA VITIENES",.)
replace referee_upper = subinstr(referee_upper, "GUILLERMO CUADRA" , "GUILLERMO CUADRA FERNANDEZ",.)
replace referee_upper = subinstr(referee_upper, "IGNACIO IGLESIAS" , "IGNACIO IGLESIAS VILLANUEVA",.)
replace referee_upper = subinstr(referee_upper, "RICARDO DE BURGOS" , "RICARDO DE BURGOS BENGOETXEA",.)
replace referee_upper = subinstr(referee_upper, "JESUS GIL" , "JESUS GIL MANZANO",.)
replace referee_upper = subinstr(referee_upper, "INAKI BIKANDI" , "INAKI VICANDI GARRIDO",.)
replace referee_upper = subinstr(referee_upper, "JOSE TEIXEIRA" , "JOSE ANTONIO TEIXEIRA VITIENES",.)
replace referee_upper = subinstr(referee_upper, "JOSE GONZALEZ" , "JOSE LUIS GONZALEZ GONZALEZ",.)
replace referee_upper = subinstr(referee_upper, "JOSE LUIS MUNUERA" , "JOSE LUIS MUNUERA MONTERO",.)
replace referee_upper = subinstr(referee_upper, "JOSE MUNUERA" , "JOSE LUIS MUNUERA MONTERO",.)
replace referee_upper = subinstr(referee_upper, "JOSE SANCHEZ" , "JOSE MARIA SANCHEZ MARTINEZ",.)
replace referee_upper = subinstr(referee_upper, "ALBEROLA ROJAS" , "JAVIER ALBEROLA ROJAS",.)
replace referee_upper = subinstr(referee_upper, "CARLOS CLOS" , "CARLOS CLOS GOMEZ",.)
replace referee_upper = subinstr(referee_upper, "JAVIER ESTRADA" , "XAVIER ESTRADA FERNANDEZ",.)
replace referee_upper = subinstr(referee_upper, "MARIO MELERO" , "MARIO MELERO LOPEZ",.)
replace referee_upper = subinstr(referee_upper, "PEDRO PEREZ" , "PEDRO JESUS PEREZ MONTERO",.)
replace referee_upper = subinstr(referee_upper, "SANTIAGO JAIME" , "SANTIAGO JAIME LATRE",.)

* undo doubling: names that already ended in GOMEZ got a second GOMEZ from the rule above
replace referee_upper = subinstr(referee_upper, "CARLOS CLOS GOMEZ GOMEZ" , "CARLOS CLOS GOMEZ",.)


* portugal

replace referee_upper = subinstr(referee_upper, "CARLOS XISTRA" , "CARLOS TABORDA XISTRA",.)
replace referee_upper = subinstr(referee_upper, "JOAO PINHEIRO" , "JOAO PEDRO PINHEIRO",.)
replace referee_upper = subinstr(referee_upper, "FABIO COSTA" , "FABIO JOSE COSTA VERISSIMO",.)
replace referee_upper = subinstr(referee_upper, "JOAO CAPELA" , "JOAO SANTOS CAPELA",.)
replace referee_upper = subinstr(referee_upper, "JORGE DE SOUSA" , "JORGE SOUSA",.)
replace referee_upper = subinstr(referee_upper, "JOAO PINTO" , "JOAO MALHEIRO PINTO",.)
replace referee_upper = subinstr(referee_upper, "NUNO ALMEIDA" , "NUNO MIGUEL SERRANO ALMEIDA",.)


* england
replace referee_upper = subinstr(referee_upper, "NIEL SWARBRICK" , "NEIL SWARBRICK",.)
replace referee_upper = subinstr(referee_upper, "PHILIP DOWD" , "PHIL DOWD",.)


* brazil
replace referee_upper = subinstr(referee_upper, "SANDRO RICCI" , "SANDRO MEIRA RICCI",.)
replace referee_upper = subinstr(referee_upper, "SAVIO PEREIRA" , "SAVIO PEREIRA SAMPAIO",.)
replace referee_upper = subinstr(referee_upper, "WILTON SAMPAIO" , "WILTON PEREIRA SAMPAIO",.)

* russia
replace referee_upper = subinstr(referee_upper, "ALEKSEI MATYUNIN" , "ALEKSEY MATYUNIN",.)
replace referee_upper = subinstr(referee_upper, "ALEKSEI SUHOY" , "ALEKSEY SUKHOY",.)
replace referee_upper = subinstr(referee_upper, "ALEXEY NIKOLAEV" , "ALEKSEY NIKOLAEV",.)
replace referee_upper = subinstr(referee_upper, "ALEKSEI NIKOLAEV" , "ALEKSEY NIKOLAEV",.)
* A referee recorded only as "ROMAN" is ROMAN GALIMOV. This must be an exact match on the whole
* name: a substring replacement would turn "ROMAN GALIMOV" into "ROMAN GALIMOV GALIMOV" and
* would also insert GALIMOV into the name of every other referee called Roman.
replace referee_upper = "ROMAN GALIMOV" if referee_upper == "ROMAN"
replace referee_upper = subinstr(referee_upper, "SERGEY IVANOV" , "SERGEI IVANOV",.)
replace referee_upper = subinstr(referee_upper, "SERGEY LAPOCHKIN" , "SERGEI LAPOCHKIN",.)
replace referee_upper = subinstr(referee_upper, "VASILIY KAZARTSEV" , "VASILY KAZARTSEV",.)


* TODO: no target spelling chosen yet for this referee
* replace referee_upper = subinstr(referee_upper, "VITALY MESHKOV" , ,.)

* italy
replace referee_upper = subinstr(referee_upper, "FRANCESCO SAIA" , "FRANCESCO PAOLO SAIA",.)
replace referee_upper = subinstr(referee_upper, "GIAMPAOLO CALVARESE" , "GIANPAOLO CALVARESE",.)
replace referee_upper = subinstr(referee_upper, "IVAN PEZZUTO" , "IVANO PEZZUTO",.)
replace referee_upper = subinstr(referee_upper, "JUAN SACCHI" , "JUAN LUCA SACCHI",.)
replace referee_upper = subinstr(referee_upper, "PAOLO MAZZOLENI" , "PAOLO SILVIO MAZZOLENI",.)

* germany
* NOTE(review): the accent clean-up earlier in this file already maps "Ö" to "O", so this rule
* probably no longer finds anything to replace - confirm before relying on it.
replace referee_upper = subinstr(referee_upper, "THORSTEN KINHÖFER" , "THORSTEN KINHOFER",.)


* turkey
replace referee_upper = subinstr(referee_upper, "DENIZ BITNEL" , "DENIZ ATES BITNEL",.)
replace referee_upper = subinstr(referee_upper, "HALIL MELER" , "HALIL UMUT MELER",.)
replace referee_upper = subinstr(referee_upper, "KUTLUHAN BILGIZ" , "KUTLUHAN BILGIC",.)

replace referee_upper = subinstr(referee_upper, "MUSTAFA ABITOGLU" , "MUSTAFA KAMIL ABITOGLU",.)

replace referee_upper = subinstr(referee_upper, "MUSTAFA KAMIL" , "MUSTAFA KAMIL ABITOGLU",.)
replace referee_upper = subinstr(referee_upper, "MUSTAFA COSKUN" , "MUSTAFA ILKER COSKUN",.)

replace referee_upper = subinstr(referee_upper, "SARPER SAKA" , "SARPER BARIS SAKA",.)
replace referee_upper = subinstr(referee_upper, "SEZGIN ZINAR" , "SEZGIN CINAR",.)

replace referee_upper = subinstr(referee_upper, "CAGATAY SAHAN" , "ZAGATAY SAHAN",.)

* undo doubling: the two MUSTAFA rules above both append ABITOGLU, so any name hit by the first
* rule (or already complete) ends up with the surname twice
replace referee_upper = subinstr(referee_upper, "MUSTAFA KAMIL ABITOGLU ABITOGLU" , "MUSTAFA KAMIL ABITOGLU",.)



* Merge referee information onto the match data ----------------------------------------------------
* The merge key is (referee_upper, season, country). If -season- is numeric here but a string
* in the referee file, enable the conversion below.
/* 
tostring season, replace  /// make it string
*/

* Many matches per referee-season-country, one row per key in the referee file.
* The referee file is addressed through ${cleandata} (relative to the project root set at the
* top of this file) instead of cd-ing into one user's absolute folder, so the script runs on
* every collaborator's machine and the working directory stays at the project root.
merge m:1 referee_upper season country using "${cleandata}/ref_full"

* Check the merge, country by country ---------------------------------------------------------------
* _merge == 1 marks matches whose referee was NOT found in the referee file. The one-way
* tables list those referees; the two-way tables show in which seasons they occur.
tab referee_upper if _merge == 1 & country == "brazil"

tab referee_upper season if _merge == 1 & country == "brazil"

tab referee_upper if _merge == 1 & country == "bundesliga"

tab referee_upper season if _merge == 1 & country == "bundesliga"

tab referee_upper if _merge == 1 & country == "dutch"

* Keep in mind that the data for Dutch referees only starts in 2014.
tab referee_upper season if _merge == 1 & country == "dutch"
* Unresolved: GEREMI and KAREL VAN DEN HEUVEL could not be identified.


tab referee_upper season if _merge == 1 & country == "france"


tab referee_upper if _merge == 1 & country == "la_liga"
tab referee_upper season if _merge == 1 & country == "la_liga"


tab referee_upper if _merge == 1 & country == "portugal"
* Portugal data starts in 2016/2017 on whoscored.
tab referee_upper season if _merge == 1 & country == "portugal"


tab referee_upper if _merge == 1 & country == "premier_league"

tab referee_upper season if _merge == 1 & country == "russia"

tab referee_upper season if _merge == 1 & country == "serie_a"

tab referee_upper season if _merge == 1 & country == "turkey"


* Nothing more can be done for the referees still unmatched; the good news is that they only
* appear once or twice per season, i.e. in only one or two games.

* Rows with _merge == 2 exist only in the referee file (no corresponding match), so they are
* not needed.
drop if _merge==2

* Save through ${finaldata} rather than a user-specific absolute path; -replace- lets the
* do-file be rerun without failing because the output already exists.
save "${finaldata}/final_treatment_wrefs.dta", replace


