Updated orfeo data selection

af40bbba · Franck Dary · fee91c64 · af40bbba · fee91c64 · af40bbba
Commit af40bbba authored 3 years ago by Franck Dary
--- a/fr_orpheo/data/Makefile
+++ b/fr_orpheo/data/Makefile
 ORFEO_DIR=../../data/fr_orpheo

 conllu : 
-	./prepareOrfeoData.py $(ORFEO_DIR)/mcf/ $(ORFEO_DIR)/meta_data/
-	./rmBlankLines.py train.conllu test.conllu
+	./getOrfeoTrainDevTest.py $(ORFEO_DIR)/mcf/ 0.15 0.15
 	~/macaon_data/scripts/conlluCheckProblems.py train.conllu > train 2> pbTrain
+	~/macaon_data/scripts/conlluCheckProblems.py dev.conllu > dev 2> pbDev
 	~/macaon_data/scripts/conlluCheckProblems.py test.conllu > test 2> pbTest
 	mv train train.conllu
+	mv dev dev.conllu
 	mv test test.conllu
-	~/oculometry/scripts/splitTrainDevTest.py --dev 0.1 --test 0.0 train.conllu
+	wc -l *\.conllu
+	wc -l pb*

 clean:
 	- rm *\.conll*
 	- rm pbTrain
+	- rm pbDev
 	- rm pbTest

--- a/fr_orpheo/data/conllu.mcd
+++ b/fr_orpheo/data/conllu.mcd
-0 FILENAME
-1 ID
-2 FORM
-3 LEMMA
-4 POS
-5 POS2
-6 EMPTY
-7 GOV
-8 LABEL
-9 EMPTY
-10 EMPTY
-11 TIME1
-12 TIME2
-13 SPKR
-14 NBLOCUTEURS
-15 MILIEU
-16 TYPE
-17 SECTEUR
--- a/fr_orpheo/data/getOrfeoTrainDevTest.py
+++ b/fr_orpheo/data/getOrfeoTrainDevTest.py
@@ -13,6 +13,8 @@ if __name__ == "__main__" :
  if len(sys.argv) != 4 :
    printUsageAndExit()

+  mcd = "# global.columns = FILE ID FORM LEMMA XPOS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER"
+
  random.seed(0)

  baseDir = sys.argv[1] + "/"
@@ -27,6 +29,8 @@ if __name__ == "__main__" :

    for line in open(baseDir+filename, "r") :
      line = line.strip()
+      # Collapse consecutive blank lines: skip a blank line when the previously kept line is already blank
+      if len(line) > 0 or len(filesPerFamily[family][-1][1]) == 0 or len(filesPerFamily[family][-1][1][-1]) != 0 :
        filesPerFamily[family][-1][1].append(line)

  totalTrain = []
@@ -81,14 +85,15 @@ if __name__ == "__main__" :
      totalTest += filesPerFamily[family][i][1]

  with open("train.conllu", "w") as out :
-    print("\n".join(totalTrain), file=out)
+    print("\n".join([mcd]+totalTrain), file=out)
  with open("dev.conllu", "w") as out :
-    print("\n".join(totalDev), file=out)
+    print("\n".join([mcd]+totalDev), file=out)
  with open("test.conllu", "w") as out :
-    print("\n".join(totalTest), file=out)
+    print("\n".join([mcd]+totalTest), file=out)

  totalLines = len(totalTrain) + len(totalDev) + len(totalTest)
  print("Total : %d lines"%totalLines)
  print("Train : %d (%.2f%%)"%(len(totalTrain), 100.0*len(totalTrain)/totalLines))
  print("Dev : %d (%.2f%%)"%(len(totalDev), 100.0*len(totalDev)/totalLines))
  print("Test : %d (%.2f%%)"%(len(totalTest), 100.0*len(totalTest)/totalLines))
+
--- a/fr_orpheo/data/prepareOrfeoData.py
+++ b/fr_orpheo/data/prepareOrfeoData.py
-#! /usr/bin/python3
-
-import sys
-import os
-
-def mcd() :
-  return "# global.columns = FILE ID FORM LEMMA POS UPOS FEATS HEAD DEPREL NONE1 NONE2 TIME1 TIME2 SPEAKER NBLOCS TYPE MILIEU"
-
-def printUsageAndExit() :
-  print("USAGE : %s rawMcfDirectory metaDataDirectory"%(sys.argv[0]), file=sys.stderr)
-  exit(1)
-
-def cleanString(s) :
-  result = s.strip()
-
-  return result.replace("\"", "").replace("/>", "")
-
-def treatDirectory(mcfs, metadatas) :
-  trains = []
-  tests = []
-  metas = {}
-  features = ["nbLocuteurs", "milieu", "type", "secteur"]
-  featuresDecoda = ["2", "assistance", "finalise", "professionnel"]
-
-  for entry in os.listdir(mcfs) :
-    if os.path.isfile(os.path.join(mcfs, entry)) :
-      if entry.endswith(".train") :
-        trains.append(entry)
-      elif entry.endswith(".test") :
-        tests.append(entry)
-      else :
-        print("ERROR : unknown file %s"%entry, file=sys.stderr)
-        exit(1)
-
-  for entry in os.listdir(metadatas) :
-    if os.path.isfile(os.path.join(metadatas, entry)) :
-      splited = entry.split('.')
-      if len(splited) != 2 or splited[1] != "xml" :
-        continue
-      name = splited[0]
-      for line in open(metadatas+entry, "r") :
-        if "corresp" in line :
-          splited = line.split(' ')
-          target = ""
-          corresp = ""
-          for s in splited :
-            splited2 = s.split('=')
-            if len(splited2) != 2 :
-              continue
-            if splited2[0] == "target" :
-              target = cleanString(splited2[1])
-            elif splited2[0] == "corresp" :
-              corresp = cleanString(splited2[1])
-            else :
-              print("ERROR : wrong line \'%s\'."%line, file=sys.stderr)
-              exit(1)
-          if name not in metas :
-            metas[name] = {}
-          metas[name][corresp] = target
-
-  output = open("train.conllu", "w")
-
-  print(mcd(), file=output)
-  for mcf in trains :
-    featsForFile = list.copy(features)
-    name = mcf.split(".")[0]
-    if name not in metas :
-      if "RATP" in name :
-        featsForFile = featuresDecoda
-      else :
-        print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr)
-        exit(1)
-    if "RATP" not in name :
-      for i in range(len(features)) :
-        featValue = "n/a"
-        if features[i] in metas[name] :
-          featValue = metas[name][features[i]]
-
-        featsForFile[i] = featValue
-
-    for line in open(mcfs+mcf, "r") :
-      clean = line.strip()
-      if len(line) <= 2 :
-        print(file=output)
-        continue
-      completeLine = clean
-      for feat in featsForFile :
-        completeLine += "\t" + feat
-      print(completeLine,file=output)
-
-  output = open("test.conllu", "w")
-
-  print(mcd(), file=output)
-  for mcf in tests :
-    featsForFile = list.copy(features)
-    name = mcf.split(".")[0]
-    if name not in metas :
-      if "RATP" in name :
-        featsForFile = featuresDecoda
-      else :
-        print("ERROR : metadata unknown for file %s."%mcf, file=sys.stderr)
-        exit(1)
-    if "RATP" not in name :
-      for i in range(len(features)) :
-        featValue = "n/a"
-        if features[i] in metas[name] :
-          featValue = metas[name][features[i]]
-
-        featsForFile[i] = featValue
-
-    for line in open(mcfs+mcf, "r") :
-      clean = line.strip()
-      if len(line) <= 2 :
-        print(file=output)
-        continue
-      completeLine = clean
-      for feat in featsForFile :
-        completeLine += "\t" + feat
-      print(completeLine,file=output)
-
-def main() :
-  if len(sys.argv) != 3 :
-    printUsageAndExit()
-
-  treatDirectory(sys.argv[1], sys.argv[2])
-
-main()
--- a/fr_orpheo/data/rmBlankLines.py
+++ b/fr_orpheo/data/rmBlankLines.py
-#! /usr/bin/env python3
-
-import sys
-
-for filename in sys.argv[1:] :
-  lines = []
-  for line in open(filename, "r") :
-    line = line.strip()
-    if len(line) == 0 and len(lines) > 0 and len(lines[-1]) == 0 :
-      continue
-    lines.append(line)
-  
-  with open(filename, "w") as out :
-    for line in lines :
-      print(line, file=out)
-