Skip to content
Snippets Groups Projects
Commit 5c1b287c authored by Franck Dary's avatar Franck Dary
Browse files

Added script to create word embeddings based on what POS can a word have

parent 0574caf5
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
# Create a w2v formatted embedding file.
# Each line associate a lowercase word with an embedding whose dimmensions are the UD POS.
# The input to this script is a combination of lefff lexicon and conllu UD corpora.
# Example: ./lefff2w2v --lefff lefff.fplm --conllu data/UD_French-GSD/*\.conllu
# Example: ./lefff2w2v --lefff lefff.fplm
# Example: ./lefff2w2v --conllu data/UD_French-GSD/*\.conllu
# We can chose to output binary vector with the option --binary which is a threshold above which values will become 1.
# We can ignore infrequent words in conllu by setting a threshold with --minfreq.
import sys
import argparse
from readMCD import readMCD
# Convert lefff part of speech into UD UPOS.
lefffPOS2UD = {
"adj" : "adj",
"csu" : "sconj",
"que" : "sconj", # Not only ?
"det" : "det",
"pres" : "intj", # Nothing match ? INTJ or X
"v" : "verb",
"nc" : "noun",
"cfi" : "noun",
"advPref" : "x", # No meaning with UD tokenization
"adjPref" : "x", # same
"suffAdj" : "x", # same
"cln" : "pron",
"ce" : "pron",
"clg" : "adp",
"cll" : "pron",
"ilimp" : "pron",
"cla" : "pron",
"cld" : "pron",
"pro" : "pron",
"caimp" : "pron",
"pri" : "adv",
"prel" : "pron",
"clr" : "pron",
"clar" : "pron",
"cldr" : "pron",
"adv" : "adv",
"advm" : "adv",
"advp" : "adv",
"coo" : "cconj",
"ponctw" : "punct",
"advneg" : "adv",
"clneg" : "adv",
"que_restr" : "sconj",
"np" : "propn",
"poncts" : "punct",
"parento" : "punct",
"epsilon" : "punct",
"parentf" : "punct",
"prep" : "adp",
"auxAvoir" : "aux",
"auxEtre" : "aux",
}
if __name__ == "__main__" :
parser = argparse.ArgumentParser()
parser.add_argument("--lefff", type=str,
help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
parser.add_argument("--conllu", nargs="+", type=str,
help="Conllu files to estimate the probability of each POS.")
parser.add_argument("--binary", type=float,
help="A threshold in [0,1] that will separate zeroes from ones.")
parser.add_argument("--minfreq", type=int,
help="A threshold in number of occurrences of words.")
parser.add_argument("--lefffWeight", type=int, default=1,
help="What is the weight, in number of occurrences of the couple (form,POS) in annotated conllu data, that the lefff add ?")
args = parser.parse_args()
if args.lefff is None and args.conllu is None :
print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
exit(1)
# Dict with key=FORM and value= dict associationg pos with number of occ
form2pos = {}
# List of all pos (UD format) present in data
allPos = []
# Associate each form with a counter, only for conllu files
formCount = {}
# Read lefff and populate form2pos with # of occ = 1
if args.lefff is not None :
for line in open(args.lefff, "r") :
splited = line.strip().split("\t")
form = splited[0].lower()
pos = lefffPOS2UD[splited[1]]
# In lefff there might be spaces in forms. W2v format don't allow it. We replace space by dotted circle.
form.replace(" ", "")
if pos not in allPos :
allPos.append(pos)
if form not in form2pos :
form2pos[form] = {}
if pos not in form2pos[form] :
form2pos[form][pos] = args.lefffWeight
# If conllu files are provided, count number of occurences into form2pos
if args.conllu is not None :
if args.conllu is not None :
for filename in args.conllu :
baseMCD = "ID FORM LEMMA POS XPOS FEATS HEAD DEPREL"
conllMCD, conllMCDr = readMCD(baseMCD)
for line in open(filename, "r") :
line = line.strip()
if "global.columns =" in line and line[0] == "#" :
splited = line.split("global.columns =")
conllMCD, conllMCDr = readMCD(splited[-1].strip())
continue
if len(line) == 0 or line[0] == "#" :
continue
splited = line.split("\t")
form = splited[conllMCD["FORM"]].lower()
pos = splited[conllMCD["UPOS"]].lower()
form.replace(" ", "")
if pos not in allPos :
allPos.append(pos)
if form not in form2pos :
form2pos[form] = {}
if pos not in form2pos[form] :
form2pos[form][pos] = 0
form2pos[form][pos] += 1
if form not in formCount :
formCount[form] = 0
formCount[form] += 1
outputLines = []
# Compute probability for each pos and form
for form in form2pos :
if args.minfreq is not None and formCount[form] < args.minfreq :
continue
vec = ["0" for _ in allPos]
totalOccs = 0
for pos in form2pos[form] :
totalOccs += form2pos[form][pos]
for pos in form2pos[form] :
vec[allPos.index(pos)] = form2pos[form][pos] / totalOccs
baseVec = vec.copy()
for pos in form2pos[form] :
if args.binary is not None :
if vec[allPos.index(pos)] >= args.binary :
vec[allPos.index(pos)] = 1
else :
vec[allPos.index(pos)] = 0
if args.binary is not None :
vec[allPos.index(pos)] = "%d"%vec[allPos.index(pos)]
else :
vec[allPos.index(pos)] = "%.2f"%vec[allPos.index(pos)]
if sum(map(float, vec)) == 0 :
print("WARNING: word '%s' gets all 0. Original: '%s'"%(form, " ".join(map(str,baseVec))), file=sys.stderr)
outputLines.append(form+" "+" ".join(vec))
# Print the w2v file
print(len(outputLines), len(allPos))
outputLines.sort()
print("\n".join(outputLines))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment