Skip to content
Snippets Groups Projects
Select Git revision
  • ef0c86eeb4040969c008d5f6dacd19ff701c7668
  • master default
  • object
  • develop protected
  • private_algos
  • cuisine
  • SMOTE
  • revert-76c4cca5
  • archive protected
  • no_graphviz
  • 0.0.1
11 results

LateFusion.py

Blame
  • fulltext_to_conll07.py 1.32 KiB
    import sys, re
    import xml.etree.ElementTree as ET
    from fulltext_to_elements import find, find_first
    
    # Namespace, in ElementTree's '{uri}' tag-prefix form, that find() prepends
    # to every tag name in a search path.
    ns = '{http://framenet.icsi.berkeley.edu}'

    def find(element, path, attrib=None):
        """Return the namespace-qualified descendants of *element* matching *path*.

        Every path step that directly follows a '/' (and does not itself start
        with '/' or '.') gets the module namespace ``ns`` prepended, so callers
        can write ``'.//sentence'`` instead of the fully qualified tag. A step
        that is not preceded by '/' (e.g. a bare ``'sentence'``) is left as is.

        Args:
            element: ElementTree element to search from.
            path: ElementTree path expression using unqualified tag names.
            attrib: optional mapping of attribute name -> required value. A
                value of None only requires the attribute to be present.
                None or an empty mapping disables attribute filtering.

        Returns:
            A list of matching elements, in the order findall() yields them.
        """
        qualified = re.sub(r'/([^/.])', '/' + ns + r'\1', path)
        wanted = attrib or {}
        return [
            node for node in element.findall(qualified)
            # Keep a node only when every requested attribute exists and, if a
            # concrete value was requested, equals that value.
            if all(
                name in node.attrib
                and (value is None or node.attrib[name] == value)
                for name, value in wanted.items()
            )
        ]
    
    def find_first(element, path, attrib=None):
        """Return the first element that find() yields, or None if there is none.

        Args:
            element: ElementTree element to search from.
            path: path expression, passed to find() unchanged.
            attrib: optional attribute filter, passed to find(); None means
                no filtering.

        Returns:
            The first matching element, or None when find() returns an empty
            list.
        """
        # Always hand find() a real mapping so it never iterates over None.
        matches = find(element, path, {} if attrib is None else attrib)
        return matches[0] if matches else None
    
    # Running count of sentences printed so far; process_fulltext_xml bumps it
    # once per sentence, so it keeps growing across files. It is not read
    # anywhere in the visible code.
    sentence_id = 0
    
    def process_fulltext_xml(filename):
        """Print each sentence of one fulltext XML file as tab-separated rows.

        For every ``sentence`` element the text of its ``text`` child is split
        on whitespace. Each token becomes one stdout line holding its 1-based
        position, the token itself and ten '_' placeholder columns; a blank
        line terminates the sentence. The module-level ``sentence_id`` counter
        is incremented once per sentence.

        Args:
            filename: path of the XML file to read.

        Raises:
            AttributeError: if a sentence has no ``text`` child, or that child
                has no text content.
        """
        global sentence_id
        # The context manager closes the handle even when parsing raises.
        with open(filename) as fp:
            root = ET.parse(fp).getroot()

        for sentence in find(root, './/sentence'):
            text = find_first(sentence, './text').text
            # NOTE(review): id + form + 10 placeholders gives 12 columns, while
            # the CoNLL-X/2007 layout has 10 - confirm what the consumer expects.
            for position, token in enumerate(text.split(), 1):
                print('\t'.join([str(position), token] + ['_'] * 10))
            # Blank separator line between sentences.
            print('')
            sentence_id += 1
    
    if __name__ == '__main__':
        # Convert every file named on the command line, in sorted order, and
        # echo each name to stderr as a progress trace that stays out of the
        # converted output on stdout.
        for filename in sorted(sys.argv[1:]):
            sys.stderr.write(filename + '\n')
            process_fulltext_xml(filename)