Skip to content
Snippets Groups Projects
Commit 29b7b6c4 authored by Franck Dary's avatar Franck Dary
Browse files

Added script to append column containing lexicon pos information into conllu file

parent 2f3c0537
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
# Take as input the lefff lexicon and conllu files.
# For each word form it will compute the possible POS.
# Then it will add a column encoding these possible POS to output conllu files.
import sys
import argparse
from readMCD import readMCD
# Lowercased UD UPOS tags accepted by this script.
# Reference list: https://universaldependencies.org/u/pos/index.html
allPos = (
    "adj adp adv aux cconj det intj noun num "
    "part pron propn punct sconj sym verb x"
).split()
# Lookup table from a lefff part-of-speech tag (keyword name) to the
# lowercased UD UPOS tag (value) used by this script.
lefffPOS2UD = dict(
    adj="adj",
    csu="sconj",
    que="sconj",  # possibly not the only valid reading of 'que'
    det="det",
    pres="intj",  # no exact UD match; INTJ was chosen, X is the alternative
    v="verb",
    nc="noun",
    cfi="noun",
    advPref="x",  # has no meaning with the UD tokenization
    adjPref="x",  # same as advPref
    suffAdj="x",  # same as advPref
    cln="pron",
    ce="pron",
    clg="adp",
    cll="pron",
    ilimp="pron",
    cla="pron",
    cld="pron",
    pro="pron",
    caimp="pron",
    pri="adv",
    prel="pron",
    clr="pron",
    clar="pron",
    cldr="pron",
    adv="adv",
    advm="adv",
    advp="adv",
    coo="cconj",
    ponctw="punct",
    advneg="adv",
    clneg="adv",
    que_restr="sconj",
    np="propn",
    poncts="punct",
    parento="punct",
    epsilon="punct",
    parentf="punct",
    prep="adp",
    auxAvoir="aux",
    auxEtre="aux",
)
# Character substituted for spaces inside word forms. Lefff forms may contain
# spaces, which the w2v format does not allow, so each space becomes a
# DOTTED CIRCLE (U+25CC).
spacePlaceholder = "\u25cc"
# Column layout assumed for a conllu file until it declares its own.
baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
# Marker of the comment line that declares the column layout of a conllu file.
columnsMarker = "global.columns ="


def normalizeForm(form) :
    """Return the lexicon key of a raw word form.

    The form is lowercased and every space is replaced by spacePlaceholder,
    so that the lexicon and the files being annotated use the same keys.
    """
    return form.lower().replace(" ", spacePlaceholder)


def columnsFromHeader(line) :
    """Return the column names declared by a '# global.columns =' line.

    line must already be stripped. Returns the text following the marker, or
    None when line is not such a declaration.
    """
    if line.startswith("#") and columnsMarker in line :
        return line.split(columnsMarker)[-1].strip()
    return None


def columnsHeader(conllMCDr) :
    """Build the '# global.columns = ...' line for an index -> name mapping."""
    return "# global.columns = %s"%(" ".join([conllMCDr[i] for i in range(len(conllMCDr))]))


def registerColumn(conllMCD, conllMCDr, colName) :
    """Add colName as the last column of both mappings, unless already there.

    conllMCD maps column name -> index and conllMCDr maps index -> name; both
    are modified in place. An existing column keeps its index, which is how
    the script replaces a column instead of duplicating it.
    """
    if colName not in conllMCD :
        conllMCD[colName] = len(conllMCD)
        conllMCDr[len(conllMCDr)] = colName


def setColumn(fields, index, value) :
    """Store value in fields[index] and return fields.

    Rows shorter than index are padded with empty cells first, so a row that
    lacks several trailing columns no longer raises IndexError.
    """
    missing = index + 1 - len(fields)
    if missing > 0 :
        fields.extend([""] * missing)
    fields[index] = value
    return fields


def warnUnknownPos(pos) :
    """Report on stderr a POS that is not in allPos; processing continues."""
    if pos not in allPos :
        print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr)


def readLefff(filename, form2pos) :
    """Add every (form, POS) pair of a lefff file to form2pos.

    filename is a tab separated file with columns FORM POS LEMMA MORPHO.
    form2pos maps form -> {pos: number of occurrences}; a pair coming from the
    lexicon gets a count of 1 unless it is already present. Blank lines are
    skipped, and lines that are malformed or carry a tag missing from
    lefffPOS2UD are reported on stderr and skipped instead of aborting.
    """
    with open(filename, "r", encoding="utf-8") as lefffFile :
        for lineNumber, line in enumerate(lefffFile, 1) :
            line = line.strip()
            if not line :
                continue
            splited = line.split("\t")
            if len(splited) < 2 :
                print("ERROR: Malformed lefff line %d in '%s'"%(lineNumber, filename), file=sys.stderr)
                continue
            if splited[1] not in lefffPOS2UD :
                print("ERROR: Unknown lefff pos '%s' (check lefffPOS2UD in the script)"%splited[1], file=sys.stderr)
                continue
            pos = lefffPOS2UD[splited[1]]
            warnUnknownPos(pos)
            form2pos.setdefault(normalizeForm(splited[0]), {}).setdefault(pos, 1)


def countConllu(filename, form2pos) :
    """Count the (form, UPOS) pairs of a conllu file into form2pos.

    The column layout starts as baseMCD and is replaced whenever the file
    declares one with a '# global.columns =' line. Lines whose ID contains
    '-' (multi-word token ranges) are ignored.
    """
    # readMCD is used as returning (name -> index, index -> name) mappings.
    conllMCD, _ = readMCD(baseMCD)
    with open(filename, "r", encoding="utf-8") as conlluFile :
        for line in conlluFile :
            line = line.strip()
            columns = columnsFromHeader(line)
            if columns is not None :
                conllMCD, _ = readMCD(columns)
                continue
            if not line or line.startswith("#") :
                continue
            splited = line.split("\t")
            if "-" in splited[conllMCD["ID"]] :
                continue
            form = normalizeForm(splited[conllMCD["FORM"]])
            pos = splited[conllMCD["UPOS"]].lower()
            warnUnknownPos(pos)
            posCount = form2pos.setdefault(form, {})
            posCount[pos] = posCount.get(pos, 0) + 1


def rewriteConllu(filename, colName, form2pos) :
    """Rewrite a conllu file in place with the column colName filled in.

    form2pos maps a normalized form to the string written in the column
    (ex. adj|verb); forms missing from it get "none". The file is read
    entirely before being reopened for writing. Its first line becomes a
    '# global.columns =' declaration that includes colName.
    """
    conllMCD, conllMCDr = readMCD(baseMCD)
    registerColumn(conllMCD, conllMCDr, colName)
    # Default declaration, replaced below if the file declares its own layout.
    newLines = [columnsHeader(conllMCDr)]
    tokenSeen = False
    with open(filename, "r", encoding="utf-8") as conlluFile :
        for line in conlluFile :
            line = line.strip()
            columns = columnsFromHeader(line)
            if columns is not None :
                conllMCD, conllMCDr = readMCD(columns)
                registerColumn(conllMCD, conllMCDr, colName)
                header = columnsHeader(conllMCDr)
                # Never throw away what was already read: before any token the
                # declaration replaces the default one, afterwards it is kept
                # at its own position.
                if tokenSeen :
                    newLines.append(header)
                else :
                    newLines[0] = header
                continue
            if not line or line.startswith("#") :
                newLines.append(line)
                continue
            tokenSeen = True
            splited = line.split("\t")
            posSetStr = form2pos.get(normalizeForm(splited[conllMCD["FORM"]]), "none")
            setColumn(splited, conllMCD[colName], posSetStr)
            newLines.append("\t".join(splited))
    with open(filename, "w", encoding="utf-8") as conlluFile :
        print("\n".join(newLines), file=conlluFile)


def main() :
    """Parse the command line, build the form -> POS lexicon, annotate the outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--lefff", type=str,
        help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
    parser.add_argument("--conllu", nargs="+", type=str,
        help="Input conllu files, to find possible POS for each word.")
    parser.add_argument("--output", nargs="+", type=str,
        help="Output conllu files. Must be existing conllu files, this script adds a new column in place.")
    parser.add_argument("--colName", type=str, default="LEXICON",
        help="Name of the column that will be added by the script. If the column already exists, it will be replaced.")
    args = parser.parse_args()

    if args.lefff is None and args.conllu is None :
        print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
        sys.exit(1)
    if args.output is None :
        print("ERROR: must provide --output", file=sys.stderr)
        sys.exit(1)

    # Dict with key=FORM and value=dict associating pos with number of occ
    form2pos = {}
    # Read lefff and populate form2pos with # of occ = 1
    if args.lefff is not None :
        readLefff(args.lefff, form2pos)
    # If conllu files are provided, count number of occurrences into form2pos
    if args.conllu is not None :
        for filename in args.conllu :
            countConllu(filename, form2pos)

    # Reshape to form -> pos set as string (ex. adj|verb), in insertion order
    posSets = {form : "|".join(posCount) for form, posCount in form2pos.items()}

    # Read all output conllu files and rewrite them in place with the new column
    for filename in args.output :
        rewriteConllu(filename, args.colName, posSets)


if __name__ == "__main__" :
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment