Skip to content
Snippets Groups Projects
Commit b234aaad authored by Carlos Ramisch's avatar Carlos Ramisch
Browse files

Update README and add function to remove subrelations in simplify_sequoia.py...

Update README and add function to remove subrelations in simplify_sequoia.py (not used for the moment)
parent b19ae9bc
Branches
No related tags found
No related merge requests found
...@@ -7,7 +7,6 @@ We obtained the file `trunk/sequoia-ud.parseme.frsemcor` from commit number `ea7 ...@@ -7,7 +7,6 @@ We obtained the file `trunk/sequoia-ud.parseme.frsemcor` from commit number `ea7
The file is the result of the conversion from Sequoia's source as described on the [documentation](https://deep-sequoia.inria.fr/process/) The file is the result of the conversion from Sequoia's source as described on the [documentation](https://deep-sequoia.inria.fr/process/)
We keep the original file in `src` folder to make command line completion faster We keep the original file in `src` folder to make command line completion faster
The file `tiny.conllu` was manually extracted and simplified, it is used in parsing exercises.
### Simplification ### Simplification
...@@ -51,6 +50,8 @@ CUTLINE=`grep -n ${LASTID} ${CORPUS} | sed 's/:.*//g'` ...@@ -51,6 +50,8 @@ CUTLINE=`grep -n ${LASTID} ${CORPUS} | sed 's/:.*//g'`
head -n $((CUTLINE-1)) ${CORPUS} > sequoia-ud.parseme.frsemcor.simple.small head -n $((CUTLINE-1)) ${CORPUS} > sequoia-ud.parseme.frsemcor.simple.small
``` ```
The file `tiny.conllu` was manually extracted and simplified; it is used in parsing exercises.
Finally, we also split the non-simplified version of the corpus into train, dev and test (before simplification). Finally, we also split the non-simplified version of the corpus into train, dev and test (before simplification).
These files should not be used in your experiments. These files should not be used in your experiments.
``` ```
......
...@@ -23,6 +23,7 @@ Simplify Sequoia corpus for pedagogical purposes: ...@@ -23,6 +23,7 @@ Simplify Sequoia corpus for pedagogical purposes:
- Columns HEAD and DEPREL (TP5 and TP6) - Columns HEAD and DEPREL (TP5 and TP6)
- Remove non-projective sentences - Remove non-projective sentences
=> Non-projective parse trees are not straightforward to handle in the dependency parsing models we implement => Non-projective parse trees are not straightforward to handle in the dependency parsing models we implement
- [EXPERIMENTAL] Remove all deprel subrelations (after colon) to simplify the tagset
This script depends on the `cuptlib` library. You can install it with: This script depends on the `cuptlib` library. You can install it with:
...@@ -125,13 +126,21 @@ def is_projective(sent): ...@@ -125,13 +126,21 @@ def is_projective(sent):
######################################### #########################################
def remove_subrelations(sent):
    """Strip subrelations from the DEPREL label of every token, in place.

    A subrelation is everything from the first colon onwards in a
    dependency label, e.g. ``nsubj:pass`` is reduced to ``nsubj``.

    Args:
        sent: Iterable of tokens supporting ``token['deprel']`` read and
            write access, where the stored value is a string.

    Returns:
        The number of tokens whose DEPREL contained a colon.
    """
    subrel_counter = 0
    for token in sent:
        # partition() returns an empty separator when no colon is present,
        # so a single call both detects and removes the subrelation.
        base, colon, _ = token['deprel'].partition(':')
        if colon:
            subrel_counter += 1
            token['deprel'] = base
    return subrel_counter
#########################################
if len(sys.argv) != 2: if len(sys.argv) != 2:
print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr) print('Usage: {} <input_corpus.conllu>'.format(sys.argv[0]), file=sys.stderr)
exit(-1) exit(-1)
with open(sys.argv[1], "r", encoding="UTF=8") as f: with open(sys.argv[1], "r", encoding="UTF=8") as f:
np_counter = range_counter = del_ne_counter = 0 np_counter = range_counter = del_ne_counter = 0
del_ssense_counter = mod_ssense_counter = 0 del_ssense_counter = mod_ssense_counter = 0 #subrel_counter = 0
np_ids = [] np_ids = []
for sent in conllu.parse_incr(f): for sent in conllu.parse_incr(f):
range_counter = range_counter + remove_range_tokens(sent) range_counter = range_counter + remove_range_tokens(sent)
...@@ -139,6 +148,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f: ...@@ -139,6 +148,7 @@ with open(sys.argv[1], "r", encoding="UTF=8") as f:
del_ssense_counter = del_ssense_counter + del_ssense_ci del_ssense_counter = del_ssense_counter + del_ssense_ci
mod_ssense_counter = mod_ssense_counter + mod_ssense_ci mod_ssense_counter = mod_ssense_counter + mod_ssense_ci
del_ne_counter = del_ne_counter + simplify_mwe_ne(sent) del_ne_counter = del_ne_counter + simplify_mwe_ne(sent)
# subrel_counter = subrel_counter + remove_subrelations(sent)
if is_projective(sent) : # Returns false to remove sentence if is_projective(sent) : # Returns false to remove sentence
if sent.metadata.get("global.columns", None): # Add header for new column if sent.metadata.get("global.columns", None): # Add header for new column
sent.metadata["global.columns"] += " PARSEME:NE" sent.metadata["global.columns"] += " PARSEME:NE"
...@@ -154,6 +164,6 @@ print( "{} discontinuous and overlapping NEs removed.\n".format(del_ne_counter), ...@@ -154,6 +164,6 @@ print( "{} discontinuous and overlapping NEs removed.\n".format(del_ne_counter),
print( "{} supersense tags removed (on MWEs or strange POS).".format(del_ssense_counter), file=sys.stderr) print( "{} supersense tags removed (on MWEs or strange POS).".format(del_ssense_counter), file=sys.stderr)
print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_counter), file=sys.stderr) print( "{} supersense tags modified (complex operators).\n".format(mod_ssense_counter), file=sys.stderr)
#print( "{} subrelations removed from deprel.".format(subrel_counter), file=sys.stderr)
print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr) print( "{} non-projective sentences removed:".format(np_counter), file=sys.stderr)
print(", ".join(np_ids), file=sys.stderr) print(", ".join(np_ids), file=sys.stderr)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment