// Working directory for Windows users:
// cd %systemdrive%\Users\%username%\Downloads\Codes\03_EBA\Source_datasets\EBA_processed\
// Working directory for macOS users:
// cd ~/Downloads/Codes/03_EBA/Source_datasets/EBA_processed/

// Convert the semicolon-delimited dictionary CSV (first row = variable names)
// into a Stata dataset; it is the using file of the m:1 merge on key_orig below.
// The path uses forward slashes: Stata accepts them on every platform, whereas
// a backslash is a directory separator on Windows only.
insheet using ../EBA_Dictionary.csv, delimiter(";") clear names
save ../EBA_Dictionary.dta, replace

// Read the raw count file (semicolon-delimited, no header row, columns v1-v4).
insheet using all_cons-count.csv, nonames delimiter(";") clear
	// Cut v1 at ".txt" and keep only the leading piece (v11), presumably the
	// name of the text file the counts were taken from.
	split v1, p(".txt")
	drop v1

	// Give the positional columns their working names.
	rename v2 key_orig
	rename v3 category
	rename v4 occurrence
	rename v11 name_entity

	// Discard rows whose entity name is ".DS_Store" (presumably a folder
	// metadata file picked up together with the text files).
	drop if name_entity == ".DS_Store"

// NOTE(review): originally described as "used later to restrict to words
// appearing in the ITS only", but tmp3.dta is overwritten further down in this
// script before it is read - confirm whether this snapshot is still needed.
save tmp3.dta, replace

// Attach the dictionary entry of each original key and keep matched rows only.
merge m:1 key_orig using ../EBA_Dictionary.dta, keep(match) nogenerate

	order name_entity key_orig cons_key
	sort name_entity cons_key key_orig

	// Within each entity and consolidated key: total number of occurrences and
	// number of rows that were folded together.
	bysort name_entity cons_key: egen cons_count = total(occurrence)
	bysort name_entity cons_key: gen has_duplicates = _N

	// The per-original-key columns are no longer needed; dropping them lets
	// identical rows collapse.
	drop key_orig occurrence
	duplicates drop
save tmp.dta, replace
	
// CONTINUE WITH ORIGINAL COMPUTATION OF MEASURES
use tmp.dta, clear

	// Per entity and category:
	//   unique_count  = number of rows in tmp.dta
	//   tot_occurence = sum of the consolidated counts
	bysort name_entity category: gen unique_count = _N
	bysort name_entity category: egen tot_occurence = total(cons_count)

	// Reduce to one row per entity and category.
	keep name_entity category unique_count tot_occurence
	order name_entity category unique_count tot_occurence
	duplicates drop

	// Numeric ids for entities and categories (needed for the reshape later on).
	egen id_name = group(name_entity)
	egen id_cat = group(category)
save tmp2.dta, replace

// interlude: create mappings
// Lookup table with one row per entity: entity name and its numeric id_name.
use tmp2.dta, clear
	keep name_entity id_name
	duplicates drop
save id_name_entity.dta, replace

// interlude: create dependent variables for Bankers data
// The share columns are written with a decimal comma; turn them into numbers,
// attach id_name and keep only entities that also appear in the main dataset.
insheet using ../Bankers_Data.csv, names delimiter(";") clear
	foreach var of varlist share_large share_medium share_snci {
		// insheet stores a column as numeric when none of its values contains
		// a decimal comma; subinstr() on such a column stops with a type
		// mismatch, so only string columns are converted.
		capture confirm string variable `var'
		if _rc == 0 {
			replace `var' = subinstr(`var', ",", ".", .)
			destring `var', replace
		}
	}
merge 1:1 name_entity using id_name_entity.dta
	keep if _merge == 3
	drop _merge 
save Bankers_Data.dta, replace

// Convert the Bankers cell file to .dta; it is merged in by name_entity when
// EBA_Master.dta is built.
insheet using ../Bankers_Cells.csv, names delimiter(";") clear
save ../Bankers_Cells.dta, replace
	
// interlude: create dependent variables for Regulators data
// The answer columns are written with a decimal comma; turn them into numbers,
// attach id_name and keep only entities that also appear in the main dataset.
insheet using ../Regulators_Data.csv, names delimiter(";") clear
	foreach var of varlist highly less important not noview {
		// insheet stores a column as numeric when none of its values contains
		// a decimal comma; subinstr() on such a column stops with a type
		// mismatch, so only string columns are converted.
		capture confirm string variable `var'
		if _rc == 0 {
			replace `var' = subinstr(`var', ",", ".", .)
			destring `var', replace
		}
	}
merge 1:1 name_entity using id_name_entity.dta
	keep if _merge == 3
	drop _merge 
save Regulators_Data.dta, replace

// Convert the Regulators cell file to .dta; it is merged in by name_entity when
// EBA_Master_Regulators.dta is built.
insheet using ../Regulators_Cells.csv, names delimiter(";") clear
save ../Regulators_Cells.dta, replace

// continue with main dataset
use tmp2.dta, clear
	// Guard: the renaming after the reshape hard-codes eight category slots,
	// so stop right away when the data hold a different number of categories.
	levelsof category, local(cat_levels)
	local num_levels : list sizeof cat_levels
	if `num_levels' != 8 {
		display as error "Error: Variable category takes `num_levels' unique value(s)."
		exit 1
	}

	// compute complexity measures
	// Continue with the numeric ids only and declare the entity/category panel.
	drop name_entity category
	order id_name id_cat
	xtset id_name id_cat

	// One row per entity; each measure gets one column per category id.
	reshape wide tot_occurence unique_count, i(id_name) j(id_cat)

	// Replace the numeric category suffix k = 1..8 by a readable prefix.
	// NOTE(review): this assumes the ids produced by egen group(category)
	// line up with the label order below - confirm against the category values.
	local cat_names Attributes EconOp FctWords LegalRef LogicalConn MathOp Other RegOp
	local k = 0
	foreach cat of local cat_names {
		local ++k
		rename tot_occurence`k' `cat'_tot
		rename unique_count`k' `cat'_uniq
	}

	// An entity/category pair that was absent before the reshape shows up as
	// missing; record it as zero instead.
	foreach measure of varlist Attributes_tot-RegOp_uniq {
		replace `measure' = 0 if `measure' == .
	}

// merge entity names
// Bring name_entity back via id_name (matched rows only) and store the result
// as the common starting point of the Bankers and Regulators datasets.
merge 1:1 id_name using id_name_entity.dta, keep(match) nogenerate
save tmp3.dta, replace

// Bankers dataset: add the Bankers variables and the cell counts to the
// measures currently in memory, keeping only entities present in all sources.
merge 1:1 name_entity using Bankers_Data.dta, keep(match) nogenerate
merge 1:1 name_entity using ../Bankers_Cells.dta, keep(match) nogenerate

	// Cells enter as a total count plus exactly one unique element ...
	rename cells Cells_tot
	gen Cells_uniq = 1

	// ... and both are added to the RegOp measures.
	replace RegOp_uniq = RegOp_uniq + Cells_uniq
	replace RegOp_tot = RegOp_tot + Cells_tot

save ../../Datasets/EBA_Master.dta, replace

// Regulators dataset: start again from the measures saved in tmp3.dta and add
// the Regulators variables, keeping matched entities only.
use tmp3.dta, clear
merge 1:1 name_entity using Regulators_Data.dta, keep(match) nogenerate

// C77 has no attributes, which returns a missing value instead of a 0
foreach measure of varlist Attributes_uniq Attributes_tot {
	replace `measure' = 0 if `measure' == .
}

merge 1:1 name_entity using ../Regulators_Cells.dta, keep(match) nogenerate

	// Cells enter as a total count plus exactly one unique element, and both
	// are added to the RegOp measures.
	rename cells Cells_tot
	gen Cells_uniq = 1
	replace RegOp_uniq = RegOp_uniq + Cells_uniq
	replace RegOp_tot = RegOp_tot + Cells_tot

save ../../Datasets/EBA_Master_Regulators.dta, replace

// Delete every intermediate dataset written by this script; only the two
// master files under ../../Datasets remain.
local scratch tmp.dta tmp2.dta tmp3.dta
local scratch `scratch' ../EBA_Dictionary.dta id_name_entity.dta
local scratch `scratch' Bankers_Data.dta Regulators_Data.dta
local scratch `scratch' ../Bankers_Cells.dta ../Regulators_Cells.dta
foreach f of local scratch {
	erase `f'
}
