Skip to content
Snippets Groups Projects
Commit 58078397 authored by Benoit Favre's avatar Benoit Favre
Browse files

initial commit

parents
Branches master
No related tags found
No related merge requests found
README 0 → 100644
python2 convert-mapping.py ../sensei-proto/src/synopsis-demo-2014/data/trs/*.trs > mapping-by-id.txt
echo "20091112_RATP_SCD_0203 spk5" | python2 speaker_type.py mapping-by-id.txt
empty:
20091112_RATP_SCD_0101
20091112_RATP_SCD_0501
20091112_RATP_SCD_1138
20091112_RATP_SCD_0887
20091112_RATP_SCD_1141
20091112_RATP_SCD_0101
20091112_RATP_SCD_0056
20091112_RATP_SCD_0290
20091112_RATP_SCD_0300
20101206_RATP_SCD_0074
20091112_RATP_SCD_1064
20101206_RATP_SCD_0416
20101206_RATP_SCD_0094
20101206_RATP_SCD_0230
20101206_RATP_SCD_0339
20091112_RATP_SCD_0797
out of service:
20101206_RATP_SCD_0449
20091112_RATP_SCD_0377
20101206_RATP_SCD_0051
import sys
from collections import defaultdict
# Group the speaker labels read from stdin by show, then report every show
# in which no speaker is labelled 'caller'.
# Each input line is split as: label;name;show1,show2,...
speakers = defaultdict(list)
for record in sys.stdin:
    label, name, show_list = record.strip().split(';')
    for show_id in show_list.split(','):
        speakers[show_id].append(label)

for show_id in speakers:
    labels = speakers[show_id]
    if 'caller' in labels:
        continue
    # Same text as printing the show followed by the list of labels.
    print('%s %s' % (show_id, labels))
from xml.etree import ElementTree as ET
import sys, os
from collections import defaultdict
# For every XML file given on the command line, print one tab-separated line
#   show <TAB> speaker id <TAB> speaker type <TAB> speaker name
# for each <Speaker> element whose name has a type recorded for that show in
# mapping.txt.

# mapping[show][speaker name] -> speaker type.
# mapping.txt lines are split as: type;name;show1,show2,...
mapping = defaultdict(dict)
with open('mapping.txt') as mapping_file:
    for line in mapping_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        speaker_type, text, shows = line.split(';')
        for show in shows.split(','):
            mapping[show][text] = speaker_type

for filename in sys.argv[1:]:
    # The show identifier is the file name up to its first dot.
    show = os.path.basename(filename).split('.')[0]
    root = ET.parse(filename).getroot()
    # Plain .get() lookups so that unknown shows/names are not inserted
    # into the mapping as a side effect of querying it.
    known_names = mapping.get(show, {})
    for speaker in root.findall(".//Speaker"):
        speaker_id = speaker.get("id")
        speaker_name = speaker.get('name')
        speaker_type = known_names.get(speaker_name, '')
        if speaker_type != '':
            print('\t'.join([show, speaker_id, speaker_type, speaker_name]))
from xml.etree import ElementTree as ET
import sys
from collections import defaultdict
# Collect, for every speaker name, the files in which that speaker is
# referenced by at least one turn, then print one line per name:
#   label;name;file1,file2,...
speakers = defaultdict(list)
for filename in sys.argv[1:]:
    root = ET.parse(filename).getroot()
    # Gather the speaker ids referenced by any turn once per file, instead of
    # rescanning every turn for every speaker.
    active_ids = set()
    for turn in root.findall(".//Turn"):
        if 'speaker' in turn.attrib:
            active_ids.update(turn.get("speaker").strip().split())
    for speaker in root.findall(".//Speaker"):
        if speaker.get("id") in active_ids:
            # A missing name attribute is recorded as '' so the labelling and
            # the final join below never receive None.
            # The file is recorded without its directory and without its
            # last four characters (the extension).
            speakers[speaker.get("name", "")].append(filename.split("/")[-1][:-4])

# Exact names labelled as caller / agent, in addition to the substring rules.
CALLER_NAMES = [
    'Apellant', 'Appelant', 'Appelant 1', 'Appelant 2', 'ami', 'appelan',
    'appelant', 'appelant ', 'appelant 2', 'appelant-fille', 'appelant1',
    'appelant2', 'appelant_F02', 'appellant', 'collègue', 'demandeur',
]
AGENT_NAMES = ['conseillet', 'conseilller', 'consiller', 'conseiler', 'conseilelr']

for speaker in sorted(speakers):
    files = speakers[speaker]
    lowered = speaker.lower()
    label = 'unknown'
    if speaker in CALLER_NAMES or 'appelant' in lowered:
        label = 'caller'
    # Evaluated after the caller rule, so a name matching both rules ends up
    # labelled 'agent'.
    if 'conseiller' in lowered or 'agent' in lowered or speaker in AGENT_NAMES:
        label = 'agent'
    print(';'.join([label, speaker, ",".join(files)]))
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
station 20101206_RATP_SCD_0325 ['agent', 'agent', 'agent']
conseiller_M 20101206_RATP_SCD_0043 ['agent', 'agent', 'agent']
station 20101206_RATP_SCD_0041 ['agent', 'agent', 'agent']
station 20091112_RATP_SCD_0567 ['agent', 'agent', 'agent']
station 20101206_RATP_SCD_0063 ['agent', 'agent', 'agent']
centrebus 20101206_RATP_SCD_0386 ['agent', 'agent', 'agent']
station 20101206_RATP_SCD_0028 ['agent', 'agent', 'agent']
station 20101206_RATP_SCD_0425 ['agent', 'agent', 'agent']
station 20101206_RATP_SCD_0308 ['agent', 'agent', 'agent']
conseiller 20101206_RATP_SCD_0449 ['agent', 'agent', 'agent']
conseiller_M01 20091112_RATP_SCD_1161 ['agent', 'agent']
20091112_RATP_SCD_0300 ['agent', 'agent']
20091112_RATP_SCD_0377 ['agent', 'agent']
20101206_RATP_SCD_0051 ['agent', 'agent']
station 20101206_RATP_SCD_0120 ['agent', 'agent']
conseiller_M01 20091112_RATP_SCD_0693 ['agent', 'agent']
conseiller_M04 20091112_RATP_SCD_0697 ['agent', 'agent']
Conseiller 2 20091112_RATP_SCD_0213 ['agent', 'agent']
20101206_RATP_SCD_0074 ['agent', 'agent']
Agent 1 20091112_RATP_SCD_0234 ['agent', 'agent']
conseiller_M 20101206_RATP_SCD_0010 ['agent', 'agent']
conseiller_M04 20091112_RATP_SCD_1148 ['agent', 'agent']
agent 20091112_RATP_SCD_0700 ['agent', 'agent']
conseiller_M02 20091112_RATP_SCD_0292 ['agent', 'agent']
20091112_RATP_SCD_1064 ['agent', 'agent']
20101206_RATP_SCD_0416 ['agent', 'agent']
Conseiller 2 20091112_RATP_SCD_0330 ['agent', 'agent']
20101206_RATP_SCD_0094 ['agent', 'agent']
conseiller 20101206_RATP_SCD_0307 ['agent', 'agent']
20101206_RATP_SCD_0230 ['agent', 'agent']
station 20091112_RATP_SCD_1216 ['agent', 'agent']
20101206_RATP_SCD_0339 ['agent', 'agent']
20091112_RATP_SCD_0101 ['agent']
20091112_RATP_SCD_1138 ['agent']
20091112_RATP_SCD_0501 ['agent']
20091112_RATP_SCD_0797 ['agent']
20091112_RATP_SCD_0887 ['agent']
20091112_RATP_SCD_1141 ['agent']
20091112_RATP_SCD_0056 ['agent']
20091112_RATP_SCD_0290 ['agent']
def load(filename):
    """Read a tab-separated speaker table into a dictionary.

    Every non-blank line must hold at least three tab-separated fields:
    show, speaker id and speaker type. Any further fields are ignored.
    Blank lines are skipped.

    Returns a dict mapping (show, speaker_id) to speaker_type. When the same
    key occurs on several lines, the last one wins.

    Raises ValueError if a non-blank line has fewer than three fields.
    """
    output = {}
    with open(filename) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Blank line (e.g. a trailing empty line): nothing to record.
                continue
            show, speaker_id, speaker_type = line.split('\t')[:3]
            output[(show, speaker_id)] = speaker_type
    return output
class SpeakerType:
    """Answer speaker-type queries from a table read with load()."""

    def __init__(self, mapping_filename):
        # Result of load(): keys are (show, speaker_id), values the type.
        self.mapping = load(mapping_filename)

    def resolve(self, show, speaker_id):
        """Return the type stored for (show, speaker_id), or None if absent."""
        return self.mapping.get((show, speaker_id))
if __name__ == '__main__':
    # Command-line use: argv[1] is the mapping file; every stdin line holds
    # "show speaker_id" separated by whitespace. Prints the resolved type, or
    # None when the pair is not in the mapping.
    import sys
    speaker_type = SpeakerType(sys.argv[1])
    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of calling resolve() with no arguments.
            continue
        print(speaker_type.resolve(*fields))
view.py 0 → 100644
from xml.etree import ElementTree as ET
import sys
def view_file(filename):
    """Display a plain-text rendering of one XML transcript in a pager.

    The rendering consists of the file name (without directory and without
    its last four characters), one "Speaker: id = name" line per <Speaker>
    element, and one "speaker: text" line per <Turn> that contains text.
    Turns without a speaker attribute are shown as 'unknown'; several speaker
    ids on one turn are joined with ';'.

    Errors raised by ET.parse (unreadable or malformed file) propagate to
    the caller.
    """
    root = ET.parse(filename).getroot()
    output = [filename.split('/')[-1][:-4], '']
    for speaker in root.findall(".//Speaker"):
        output.append('Speaker: %s = %s' % (speaker.get('id'), speaker.get('name')))
    output.append('')
    for turn in root.findall(".//Turn"):
        speaker = 'unknown'
        if 'speaker' in turn.attrib:
            speaker = ';'.join(turn.get('speaker').split())
        # Text directly inside the turn, followed by the text after each
        # child element. Both .text and .tail are None when there is no text
        # at that position, so None entries are dropped before stripping.
        pieces = [turn.text] + [node.tail for node in turn]
        text = [piece.strip() for piece in pieces if piece is not None]
        text = [piece for piece in text if piece != '']
        if text:
            output.append(speaker + ': ' + ' '.join(text))
    import pydoc
    pydoc.pager('\n'.join(output))
if __name__ == '__main__':
    # Interactive viewer: argv[1] is the directory holding the .trs files;
    # each name typed at the prompt is displayed with view_file().
    import readline  # not referenced below; presumably imported for prompt line editing
    directory = sys.argv[1]
    while True:
        try:
            filename = raw_input('tsv> ')
        except EOFError:
            # End of input (e.g. Ctrl-D): finish the prompt line and leave
            # quietly instead of ending with a traceback.
            print('')
            break
        try:
            view_file(directory + '/' + filename + '.trs')
        except IOError:
            print('Error: could not load %s' % filename)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment