#!/usr/local/bin/python3
# -*- coding: utf-8 -*-
# OPTIMIZED FOR OS X

import sys
import os
import re

# ###########################################################################
# METHODS
# ###########################################################################

# THE CLEANING HERE NEEDS TO WORK IN THE SAME WAY AS IN THE N-GRAM CLASSIFICATION CODE
def clean(value):
    """Return *value* without parentheses, quotes and commas, lower-cased.

    The characters ``(``, ``)``, ``'``, ``"`` and ``,`` are deleted (not
    replaced by a space); every other character is kept and lower-cased.
    """
    without_punctuation = re.compile(r"[()'\",]").sub("", value)
    return without_punctuation.lower()


# -------------------------------------------------------------------------
# do_run(input_dir, dict_file_name, output_dir)
# -------------------------------------------------------------------------
def _load_dictionary(dict_file_name):
    """Read the consolidated dictionary file into a ``{term: label}`` mapping.

    The first line is treated as a header and skipped. Every other line is
    split on ";": the first field (surrounding double quotes removed) is the
    term, the second field (quotes and whitespace removed, lower-cased) is
    its label. Lines with fewer than two fields, e.g. blank lines, are
    ignored instead of raising ``IndexError``.
    """
    entries = {}
    # The file is read with the platform default encoding, as before.
    with open(dict_file_name, "r") as dict_file:
        dict_file.readline()  # ignore header
        for line in dict_file:
            tokens = line.strip().split(";")
            if len(tokens) < 2:
                continue
            entries[tokens[0].strip('"')] = tokens[1].strip('"').strip().lower()
    return entries


def _strip_txt_suffix(file_name):
    """Return *file_name* without a trailing ".txt", if it has one."""
    # str.rstrip(".txt") would strip any trailing '.', 't' or 'x' characters
    # ("text.txt" -> "te"), so the suffix is removed explicitly.
    if file_name.endswith(".txt"):
        return file_name[:-len(".txt")]
    return file_name


def do_run(input_dir, dict_file_name, output_dir):
    """Count dictionary terms in every file of *input_dir*.

    For each regular entry ``NAME`` of *input_dir* two files are written to
    *output_dir* (``BASE`` is ``NAME`` without a trailing ".txt"):

    * ``cons-count_BASE.csv`` -- one ``NAME;term;label;count`` line per
      dictionary term that occurs at least once in the lower-cased text;
    * ``residual_cons-count_BASE.txt`` -- the lower-cased text that is left
      after every counted occurrence has been replaced by a single space.

    Sub-directories of *input_dir* are skipped with a message and produce no
    output files.

    Parameters:
        input_dir: directory holding the text files (with or without a
            trailing path separator).
        dict_file_name: ";"-separated dictionary file with one header line.
        output_dir: existing directory that receives the result files.
    """
    print("<<< process_text.py")

    #
    # LOAD CONSOLIDATED DICTIONARY
    #
    entries = _load_dictionary(dict_file_name)
    print("  <<< READ DICTIONARY: " + dict_file_name + " WITH " + str(len(entries)) + " ENTRIES")

    # Terms are tried longest first; because each counted match is replaced
    # by a space, a shorter term cannot match inside text that a longer term
    # already consumed. The patterns do not depend on the input file, so they
    # are compiled once here.
    # NOTE(review): the text is lower-cased but the dictionary terms are not,
    # so a term containing upper-case letters never matches -- confirm that
    # the dictionary is already lower-case.
    patterns = [
        (key, re.compile(r'\b' + re.escape(key) + r'[\s.,:;!?\'"`]*\b'))
        for key in sorted(entries, key=len, reverse=True)
    ]

    #
    # LOOP OVER ALL FILES IN DIRECTORY AND FIND OCCURRENCE OF EACH DICT ENTRY
    #
    for input_file_name in os.listdir(input_dir):
        in_file_name = os.path.join(input_dir, input_file_name)

        # Check before creating any output so a directory leaves no empty
        # result files and no open handles behind.
        if os.path.isdir(in_file_name):
            print("<<< SKIPPED:" + input_dir + "/" + input_file_name)
            continue

        print("  <<< WORKING ON: " + in_file_name)

        with open(in_file_name, 'r', encoding='utf-8', errors='replace') as in_file:
            in_text = in_file.read().lower()

        # clean the in_text: drop these literal fragments
        in_text = in_text.replace("en  .", " ")
        in_text = in_text.replace("en.  ", " ")

        print("    << TEXT LENGTH BEFORE REMOVING:", len(in_text))

        rows = []
        for key, pattern in patterns:
            # subn counts and removes the matches in a single pass
            in_text, count = pattern.subn(" ", in_text)
            if count > 0:
                rows.append(input_file_name + ";" + key + ";" + entries[key] + ";" + str(count) + "\n")

        print("    << TEXT LENGTH AFTER REMOVING: ", len(in_text))

        base_name = _strip_txt_suffix(input_file_name)
        out_path = os.path.join(output_dir, "cons-count_" + base_name + ".csv")
        residual_path = os.path.join(output_dir, "residual_cons-count_" + base_name + ".txt")

        # UTF-8 matches how the input was decoded; replacement characters
        # produced by errors='replace' cannot be written in e.g. ASCII.
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(rows)
        with open(residual_path, 'w', encoding='utf-8') as residual_file:
            residual_file.write(in_text)

# -------------------------------------------------------------------------


# -------------------------------------------------------------------------
#
#  MAIN
#
# -------------------------------------------------------------------------
if __name__ == '__main__':
    #
    # VARIABLES
    #
    # Expected command line: <input_dir> <dict_file_name> <output_dir>
    # Additional arguments are ignored.
    args = sys.argv
    if len(args) < 4:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: " + args[0] + " <input_dir> <dict_file_name> <output_dir>")

    input_dir = args[1]
    dict_file_name = args[2]
    output_file_name = args[3]  # passed to do_run as its output directory

    #
    # CODE
    #
    do_run(input_dir, dict_file_name, output_file_name)
