#! /usr/bin/env python3
# Create a w2v (word2vec text) formatted embedding file.
# Each output line associates a lowercase word with an embedding whose
# dimensions are the UD POS tags: component i estimates the probability that
# the word bears the POS allPos[i].
# The input to this script is a combination of the lefff lexicon and conllu
# UD corpora.
# Examples:
#   ./lefff2w2v.py --lefff lefff.fplm --conllu data/UD_French-GSD/*.conllu
#   ./lefff2w2v.py --lefff lefff.fplm
#   ./lefff2w2v.py --conllu data/UD_French-GSD/*.conllu
# With --binary T, probabilities >= T become 1 and the others become 0.
# With --minfreq N, words seen fewer than N times in the conllu data are ignored.

import sys
import argparse

from readMCD import readMCD

# Convert a lefff part of speech into a (lowercase) UD UPOS.
lefffPOS2UD = {
    "adj": "adj",
    "csu": "sconj",
    "que": "sconj",  # Not only ?
    "det": "det",
    "pres": "intj",  # Nothing matches ? INTJ or X
    "v": "verb",
    "nc": "noun",
    "cfi": "noun",
    "advPref": "x",  # No meaning with UD tokenization
    "adjPref": "x",  # same
    "suffAdj": "x",  # same
    "cln": "pron",
    "ce": "pron",
    "clg": "adp",
    "cll": "pron",
    "ilimp": "pron",
    "cla": "pron",
    "cld": "pron",
    "pro": "pron",
    "caimp": "pron",
    "pri": "adv",
    "prel": "pron",
    "clr": "pron",
    "clar": "pron",
    "cldr": "pron",
    "adv": "adv",
    "advm": "adv",
    "advp": "adv",
    "coo": "cconj",
    "ponctw": "punct",
    "advneg": "adv",
    "clneg": "adv",
    "que_restr": "sconj",
    "np": "propn",
    "poncts": "punct",
    "parento": "punct",
    "epsilon": "punct",
    "parentf": "punct",
    "prep": "adp",
    "auxAvoir": "aux",
    "auxEtre": "aux",
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lefff", type=str,
        help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
    parser.add_argument("--conllu", nargs="+", type=str,
        help="Conllu files to estimate the probability of each POS.")
    parser.add_argument("--binary", type=float,
        help="A threshold in [0,1] that will separate zeroes from ones.")
    parser.add_argument("--minfreq", type=int,
        help="A threshold in number of occurrences of words.")
    parser.add_argument("--lefffWeight", type=int, default=1,
        help="What is the weight, in number of occurrences of the couple (form,POS) in annotated conllu data, that the lefff add ?")

    args = parser.parse_args()

    if args.lefff is None and args.conllu is None:
        print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
        sys.exit(1)

    # key=FORM, value=dict associating a UD POS with its number of occurrences.
    form2pos = {}
    # All POS (UD format, lowercase) present in the data; index = vector dimension.
    allPos = []
    # Number of occurrences of each form, counted on conllu files only.
    formCount = {}

    # Read the lefff: each (form, pos) couple counts as args.lefffWeight occurrences.
    if args.lefff is not None:
        with open(args.lefff, "r") as lefffFile:
            for line in lefffFile:
                splited = line.strip().split("\t")
                form = splited[0].lower()
                lefffTag = splited[1]
                if lefffTag not in lefffPOS2UD:
                    # Robustness fix: an unmapped lefff tag used to raise a
                    # KeyError; warn and skip the entry instead.
                    print("WARNING: ignoring unknown lefff POS '%s'" % lefffTag,
                          file=sys.stderr)
                    continue
                pos = lefffPOS2UD[lefffTag]
                # In lefff there might be spaces in forms. The w2v format
                # doesn't allow it, so replace each space by a dotted circle.
                # Bug fix: str.replace returns a new string, the result must
                # be assigned back (it was previously discarded).
                form = form.replace(" ", "◌")
                if pos not in allPos:
                    allPos.append(pos)
                if form not in form2pos:
                    form2pos[form] = {}
                if pos not in form2pos[form]:
                    form2pos[form][pos] = args.lefffWeight

    # If conllu files are provided, count (form, pos) occurrences into form2pos.
    if args.conllu is not None:
        # Bug fix: the default column list must name the column UPOS (not POS),
        # since it is looked up below as conllMCD["UPOS"].
        baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL"
        for filename in args.conllu:
            conllMCD, conllMCDr = readMCD(baseMCD)
            with open(filename, "r") as conlluFile:
                for line in conlluFile:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    if line[0] == "#":
                        # A '# global.columns = ...' header redefines the columns.
                        if "global.columns =" in line:
                            splited = line.split("global.columns =")
                            conllMCD, conllMCDr = readMCD(splited[-1].strip())
                        continue
                    splited = line.split("\t")
                    form = splited[conllMCD["FORM"]].lower()
                    pos = splited[conllMCD["UPOS"]].lower()
                    # Bug fix: keep the result of str.replace (see above).
                    form = form.replace(" ", "◌")
                    if pos not in allPos:
                        allPos.append(pos)
                    if form not in form2pos:
                        form2pos[form] = {}
                    if pos not in form2pos[form]:
                        form2pos[form][pos] = 0
                    form2pos[form][pos] += 1
                    if form not in formCount:
                        formCount[form] = 0
                    formCount[form] += 1

    outputLines = []

    # Turn the POS counts of each form into a probability (or binary) vector.
    for form in form2pos:
        # Frequency filter. Bug fix: a lefff-only form is absent from
        # formCount and used to raise a KeyError; it now counts as 0
        # occurrences. The filter only applies when conllu data was read,
        # otherwise every form would be dropped.
        if (args.minfreq is not None and args.conllu is not None
                and formCount.get(form, 0) < args.minfreq):
            continue
        totalOccs = sum(form2pos[form].values())
        # baseVec holds the raw probabilities (kept for the warning below);
        # POS never seen with this form keep the placeholder "0".
        baseVec = ["0" for _ in allPos]
        for pos, nbOccs in form2pos[form].items():
            baseVec[allPos.index(pos)] = nbOccs / totalOccs
        vec = list(baseVec)
        for pos in form2pos[form]:
            index = allPos.index(pos)
            prob = baseVec[index]
            if args.binary is not None:
                # Binarize: 1 if the probability reaches the threshold, else 0.
                vec[index] = "%d" % (1 if prob >= args.binary else 0)
            else:
                vec[index] = "%.2f" % prob
        if sum(map(float, vec)) == 0:
            print("WARNING: word '%s' gets all 0. Original: '%s'"
                  % (form, " ".join(map(str, baseVec))), file=sys.stderr)
        outputLines.append(form + " " + " ".join(vec))

    # Print the w2v file: a header line '<nb words> <nb dimensions>', then one
    # line per word, sorted.
    print(len(outputLines), len(allPos))
    outputLines.sort()
    print("\n".join(outputLines))