# Select Git revision
# LateFusion.py
# -
# Baptiste Bauvin authored
# fulltext_to_conll07.py 1.32 KiB
import sys, re
import xml.etree.ElementTree as ET
from fulltext_to_elements import find, find_first
# XML namespace in ElementTree's "{uri}" tag-prefix form; prepended to path
# steps by find() below.
ns = '{http://framenet.icsi.berkeley.edu}'


def find(element, path, attrib=None):
    """Return the namespace-prefixed elements under *element* matching *path*.

    Every path step that directly follows a ``/`` (and does not itself begin
    with ``/`` or ``.``) is prefixed with ``ns`` before the path is passed to
    ``element.findall``.

    Args:
        element: object providing ``findall`` (e.g. an ElementTree element).
        path: ElementTree path written without the namespace, such as
            ``'.//sentence'``.
        attrib: optional mapping of attribute name to required value.  A node
            is kept only if it carries every listed attribute; a value of
            ``None`` means "attribute must be present, with any value".

    Returns:
        A list of the matching nodes in ``findall`` order (empty when nothing
        matches).
    """
    # None instead of a shared ``{}`` default avoids the mutable-default trap.
    if attrib is None:
        attrib = {}
    # Only steps preceded by '/' are rewritten: './/a/b' -> './/{ns}a/{ns}b'.
    qualified_path = re.sub(r'/([^/.])', '/' + ns + r'\1', path)
    return [
        node
        for node in element.findall(qualified_path)
        if all(
            name in node.attrib
            and (value is None or node.attrib[name] == value)
            for name, value in attrib.items()
        )
    ]
def find_first(element, path, attrib=None):
    """Return the first node that ``find`` yields, or ``None`` if there is none.

    Args:
        element: element to search from; passed through to ``find``.
        path: path expression; passed through to ``find``.
        attrib: optional attribute filter mapping; passed through to ``find``.
            ``None`` (the default) means no attribute filtering.

    Returns:
        The first element of the list returned by ``find``, or ``None`` when
        that list is empty.
    """
    # Normalise here so no mutable default is shared between calls and so
    # ``find`` always receives a real mapping.
    matches = find(element, path, {} if attrib is None else attrib)
    return matches[0] if matches else None
# Running count of sentences emitted so far, across every processed file.
sentence_id = 0


def process_fulltext_xml(filename):
    """Print each sentence of an XML file as tab-separated token rows.

    For every ``sentence`` element found under the document root, the content
    of its ``text`` child is split on whitespace and one row per token is
    written to stdout: the 1-based token index, the token, then ten ``_``
    placeholder columns.  A blank line follows each sentence.  The module-level
    ``sentence_id`` counter is incremented once per sentence.

    Args:
        filename: path of the XML file to read.
    """
    global sentence_id
    # Binary mode lets the XML parser honour the document's own encoding
    # declaration; the context manager closes the file even if parsing fails.
    with open(filename, 'rb') as fp:
        root = ET.parse(fp).getroot()
    for sentence in find(root, './/sentence'):
        text_node = find_first(sentence, './text')
        # A sentence with no <text> child, or an empty one, has no tokens.
        # It is still emitted (as an empty sentence) so the output keeps one
        # record per <sentence> element instead of crashing on None.
        text = text_node.text if text_node is not None else None
        for word_id, word in enumerate((text or '').strip().split()):
            print('\t'.join([str(word_id + 1), word] + ['_'] * 10))
        # Sentence separator.  print('') rather than a bare ``print`` so the
        # line behaves the same under Python 2 and Python 3.
        print('')
        sentence_id += 1
if __name__ == '__main__':
    # Every command-line argument is an input file; handle them in sorted order.
    for filename in sorted(sys.argv[1:]):
        # Progress goes to stderr so it does not mix with the rows on stdout.
        # sys.stderr.write replaces the Python-2-only ``print >>sys.stderr``
        # form and produces the same output on Python 2 and Python 3.
        sys.stderr.write(filename + '\n')
        process_fulltext_xml(filename)