# -*- coding: utf-8 -*-
# Maïeul ROUQUETTE ; Annette von STOCKHAUSEN
# GPL 3
# https://www.gnu.org/licenses/gpl-3.0.html
# Ce script permet de transformer des textes issu du TLG en texte utilisable en LaTeX :
#    - suppression des numeros de lines
#    - suppression des césures
#    - remplacement des guillemets par des \enquote{}
# Version 2.6.0
import re
import os
import default as config
import unicodedata
global stanza
stanza = False # set to True when verses starts
def normaliser_fichier(fichier):
	'''Normalise un fichier'''
	import codecs
	finale = ''
	debut_phrase = True
	file = codecs.open(fichier,encoding='utf-8')
	for line in file:
		if line not in config.empty_line_r:
			finale = finale + normalize_line(line)+'\n'
		else:
			finale	= finale + config.empty_line_w
	
	# correct end stanza
	finale = finale.replace(config.between_stanza_w+"\n"+config.after_stanza_w+"\n","\n"+config.after_stanza_w+"\n")
	
		
	file.close()
	if os.path.dirname(fichier)=="":
	    destination = "normal_" + os.path.basename(fichier)
	else:
	    destination = os.path.dirname(fichier) + os.sep + "normal_" + os.path.basename(fichier)
	file = codecs.open(destination,encoding='utf-8',mode='w')
	file.write(finale)
	file.close()

def make_regexp_linenumber_hyphen():
	'''Make a regexp containing regexp for the line number preceding by the regexp for the hyphen.
	Return also the corresponding regexp replacement'''
	line_number_r,line_number_w = config.line_number_r,config.line_number_w

	# Prepare the regexp
	line_number_r = "([" + "".join(config.hyphen) + "]?)" + "[\s]*" + line_number_r

	# Prepare the regexp replacement
	line_number_w = line_number_w.replace("1","2") + "\\1"

	return line_number_r,line_number_w


def normalize_line(line):
	'''Normalize one line'''


	line_number_r,line_number_w = make_regexp_linenumber_hyphen()
	line = re.sub(line_number_r,line_number_w,line) 	# change of line number

	
	# are we at the begining of a new paragraph
	if re.match(config.par_break_r,line):
		paragraph = True
	else:
		paragraph = False
	
	# for stanza
	global stanza
	stanza_start = False
	stanza_end = False
	
	if re.match(config.before_stanza_r,line):
		if stanza == False:
			stanza = True
			stanza_start = True
		else:
			stanza = False
			stanza_end = True
		
	
	line = line.strip()			# suppression des espaces de début et fin
	# hyphenation
	try:
		if line[-1] in config.hyphen:	
			line = line[:-1] + "%"
	except:
		pass

	
	# les guillemets
	line = re.sub(config.ellipsis,r"\1'",line) # replace ’ in Ellipsis with ', otherwise not discernable from single endquote
	line = re.sub(config.begin_quote_r,config.begin_quote_w,line)
	line = re.sub(config.end_quote_r,config.end_quote_w,line)
	line = re.sub("\'",config.ellipsis_back,line) # replace ’ back	
	
	#tiret
	line = re.sub(config.ndash_r,config.ndash_w,line)
	
	#insert
	line = re.sub(config.begin_insert_r,config.begin_insert_w,line)
	line = re.sub(config.end_insert_r,config.end_insert_w,line)
	
	# chapters and paragraphs
	line = re.sub(config.paragraph_r,config.paragraph_w,line) # paragraph number
	line = re.sub(config.chapter_r,config.chapter_w,line) #chapter number  
	
	# paragraph begining:
	if paragraph:
		line = config.par_break_w + line
	
	# last series of regexp
	if config.last_regexp:
		for regexp in config.last_regexp:
		    line = re.sub(regexp[0],regexp[1],line)
	
	# stanza
	if stanza_end:
		line = config.after_stanza_w
	elif stanza_start:
		line = config.before_stanza_w
	elif stanza :
		line = line + config.between_stanza_w
	
	# Unicode normalization
	if config.unicode_normalize:
	    line = unicodedata.normalize(config.unicode_normalize,line)
	return line

def test():
	"""Be sur any modification doesn't break compatibilty"""
	test = os.listdir("test") 	 #All the file of the test directory.
	import hashlib
	for file in test:
		if file[0] in ["0","1","2","3","4","5","6","7","8","9"]: #If it's a file to be tested.
		    md5 = hashlib.md5(open("test" + os.sep + "normal_" + file,"rb").read()).hexdigest()
		    normaliser_fichier("test" + os.sep + file)
		    
		    if md5 !=hashlib.md5(open("test" + os.sep + "normal_" + file,"rb").read()).hexdigest():
			    print ("Error on file" + file)
			    
		    else:
			    print ("File "+file+ " OK")

def __main__():
	import sys
	import getopt
	option = getopt.getopt(sys.argv[1:],'')[1]
	if option == ['test']:
		test()
		sys.exit()
	else:
		for fichier in option:
			try:
			    normaliser_fichier(fichier)
			    print (fichier + " normalisé")
			except Exception as e:
			    print ("Can't normalize "+ fichier + " "+ str(e))
		sys.exit()
	

__main__()