Skip to content
Snippets Groups Projects
Commit 0450dd37 authored by Franck Dary
Browse files

Removed duplicates in fplm generation, and made it deterministic

parent 804c33a6
No related branches found
No related tags found
No related merge requests found
......@@ -30,6 +30,9 @@ if __name__ == "__main__" :
conllMCD = readMCD(sys.argv[2])
conllMCDr = {v: k for k, v in conllMCD.items()}
entriesSet = set()
entriesList = []
for line in open(sys.argv[1], "r", encoding="utf8") :
if len(line.strip()) < 3 :
continue
......@@ -40,8 +43,16 @@ if __name__ == "__main__" :
if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
continue
print(columns[int(conllMCDr["FORM"])],end='\t')
print(columns[int(conllMCDr["POS"])],end='\t')
print(columns[int(conllMCDr["LEMMA"])],end='\t')
print(columns[int(conllMCDr["MORPHO"])],end='\n')
entry = ""
for col in ["FORM", "POS", "LEMMA", "MORPHO"] :
entry = entry + columns[int(conllMCDr[col])] + '\t'
entry = entry[:-1]
if entry not in entriesSet :
entriesSet.add(entry)
entriesList.append(entry)
entriesList.sort()
for entry in entriesList :
print(entry)
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment