From 2f3c05378e414676783e8c10ca24e1c704b5bf68 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Fri, 20 May 2022 14:18:58 +0200 Subject: [PATCH] fixed script to generate embeddings from lexicon --- scripts/lefff2w2v.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/lefff2w2v.py b/scripts/lefff2w2v.py index 53e0f93..1aa5e78 100755 --- a/scripts/lefff2w2v.py +++ b/scripts/lefff2w2v.py @@ -92,7 +92,9 @@ if __name__ == "__main__" : form = splited[0].lower() pos = lefffPOS2UD[splited[1]] # In lefff there might be spaces in forms. W2v format don't allow it. We replace space by dotted circle. - form.replace(" ", "◌") + form = form.replace(" ", "◌") + if " " in form : + print("HERE '%s'"%form, file=sys.stderr) if pos not in allPos : print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr) if form not in form2pos : @@ -122,7 +124,7 @@ if __name__ == "__main__" : continue form = splited[conllMCD["FORM"]].lower() pos = splited[conllMCD["UPOS"]].lower() - form.replace(" ", "◌") + form = form.replace(" ", "◌") if pos not in allPos : print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr) if form not in form2pos : -- GitLab