Commit edd42965 authored by Carlos Ramisch

Transfer theoretical course (CM) code snippets and examples to this repo

parent 127f0b3b
Showing 18279 additions and 0 deletions
@@ -2,3 +2,7 @@
Pedagogical materials for the 2024-2025 version of the advanced NLP course
of Master 2 in AI and ML, Aix Marseille University and Centrale Marseille.
* `sequoia`: (simplified) Sequoia corpus used for all lab exercises (TP)
* `lib`: code provided to speed up system development; includes the CoNLL-U library `conllulib` and the evaluation script `accuracy.py`
* `cm-code`: code snippets shown during the theoretical course (CM)
#!/usr/bin/bash
# Count the number of distinct morphological supertags in Sequoia
cat ../../tp/share/sequoia/sequoia-ud.parseme.frsemcor.simple.full |
grep "^[0-9]" |   # keep only token lines (skip comments and empty lines)
cut -d " " -f 6 | # field 6 = FEATS column, used as the morphological supertag
sort |
uniq |
wc -l
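# The field number follows the CoNLL-U column order (1=ID, 2=FORM, 3=LEMMA,
# 4=UPOS, 5=XPOS, 6=FEATS, ...), assuming the simplified corpus is
# space-separated as the -d " " above suggests. For instance, swapping in
# -f 4 would count the distinct UPOS tags with the same pipeline.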
#!/usr/bin/bash
# Build train/dev/test splits for text classification from the Dumas novels.
# Dev and test both come from *.test.tok: dev takes the first 300 kept lines
# and test the last 300, so they do not overlap as long as each novel keeps
# at least 600 lines after filtering.
DUMAS_FOLDER="../../../../../talia/tp/data/alexandre_dumas"

> dumas_train.txt
for f in ${DUMAS_FOLDER}/*.train.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |  # keep sentences with at least 15 tokens
  head -n 1200 |
  sed -E 's@ ?</?s> ?@@g' |        # remove <s> and </s> sentence markers
  awk -v f=`basename ${f%.train.tok}` '{print(f, $0)}' |  # prefix the novel name as class label
  cat >> dumas_train.txt
done

> dumas_dev.txt
for f in ${DUMAS_FOLDER}/*.test.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |
  head -n 300 |
  sed -E 's@ ?</?s> ?@@g' |
  awk -v f=`basename ${f%.test.tok}` '{print(f, $0)}' |
  cat >> dumas_dev.txt
done

> dumas_test.txt
for f in ${DUMAS_FOLDER}/*.test.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |
  tail -n 300 |
  sed -E 's@ ?</?s> ?@@g' |
  awk -v f=`basename ${f%.test.tok}` '{print(f, $0)}' |
  cat >> dumas_test.txt
done
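# Each output line is "<novel-basename> <text...>": the first field serves as
# the class label, which is how read_corpus() in train_textclass.py below
# splits labels from input text.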
#!/usr/bin/bash
# Run prediction and evaluation for every (model, input type) combination.
mkdir -p pred
for model in bow gru cnn; do
  for in_type in word char; do
    echo "Evaluating model model/model-${model}-${in_type}.pt" | tee pred/dumas_test_pred-${model}-${in_type}.acc
    ./predict_textclass.py dumas_test.txt model/model-${model}-${in_type}.pt > pred/dumas_test_pred-${model}-${in_type}.txt
    ./eval_textclass.py dumas_test.txt pred/dumas_test_pred-${model}-${in_type}.txt | tee -a pred/dumas_test_pred-${model}-${in_type}.acc # append: keep the header line written above
  done
done
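# Predictions land in pred/dumas_test_pred-<model>-<in_type>.txt and the
# corresponding accuracy in the matching .acc file, one pair per combination.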
#!/usr/bin/env python3
import sys

if __name__ == "__main__" :
    if len(sys.argv) != 3 : # Prefer using argparse, more flexible
        print("Usage: {} gold-testfile.txt pred-testfile.txt".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    with open(sys.argv[1], 'r', encoding='utf-8') as goldfile,\
         open(sys.argv[2], 'r', encoding='utf-8') as predfile:
        total = correct = 0
        for (gline, pline) in zip(goldfile, predfile) :
            # The class label is the first whitespace-separated field of each line
            correct += int(gline.strip().split()[0] == pline.strip().split()[0])
            total += 1
        print(f"Accuracy = {correct * 100 / total:.2f}")

# ./eval_textclass.py dumas_test.txt dumas_test_pred-bow.txt
# ./eval_textclass.py dumas_test.txt dumas_test_pred-gru.txt
#!/usr/bin/env python3
import sys, torch
from train_textclass import read_corpus, BOWClassifier, GRUClassifier, CNNClassifier

################################################################################
def rev_vocab(vocab):
    rev_dict = {y: x for x, y in vocab.items()}
    return [rev_dict[k] for k in range(len(rev_dict))] # list mapping ID -> label

################################################################################
if __name__ == "__main__" :
    if len(sys.argv) != 3 : # Prefer using argparse, more flexible
        print("Usage: {} testfile.txt modelfile.pt".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    load_dict = torch.load(sys.argv[2], weights_only=False)
    wordvocab = load_dict["wordvocab"]
    tagvocab = load_dict["tagvocab"]
    hp = load_dict["hyperparams"]
    if hp["model_type"] == 'bow' :
        model = BOWClassifier(hp["d_embed"], len(wordvocab), len(tagvocab))
    elif hp["model_type"] == 'gru' :
        model = GRUClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    else : #if hp["model_type"] == 'cnn' :
        model = CNNClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    model.load_state_dict(load_dict["model_params"])
    model.eval() # disable dropout at prediction time
    words, _, _, _ = read_corpus(sys.argv[1], wordvocab, tagvocab, hp["in_type"],
                                 train_mode=False, batch_mode=False)
    revtagvocab = rev_vocab(tagvocab)
    for sentence in words :
        pred_scores = model(torch.LongTensor([sentence])) # No need to batch
        print(revtagvocab[pred_scores.argmax()]) # No need to softmax
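# Example, with checkpoint names as produced by the training script below:
# ./predict_textclass.py dumas_test.txt model/model-gru-word.pt > pred/dumas_test_pred-gru-word.txt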
#!/usr/bin/bash
# Train every (model, input type) combination and archive each checkpoint.
mkdir -p model
for model in bow gru cnn; do
  for in_type in word char; do
    ./train_textclass.py dumas_train.txt dumas_dev.txt ${model} ${in_type}
    mv model.pt model/model-${model}-${in_type}.pt # train_textclass.py always writes model.pt
  done
done
#!/usr/bin/env python3
import sys, torch, collections, tqdm
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

################################################################################
class CNNClassifier(nn.Module):
    def __init__(self, d_embed, d_hidden, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.conv = nn.Conv1d(d_embed, d_hidden, kernel_size=5)
        self.dropout = nn.Dropout(0.1)
        self.decision = nn.Linear(d_hidden, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        conved = self.conv(embedded.transpose(2,1)) # Conv1d expects (batch, d_embed, seq)
        hidden = nn.functional.max_pool1d(conved, conved.shape[-1]) # max over time
        return self.decision(self.dropout(hidden.squeeze()))

################################################################################
class GRUClassifier(nn.Module):
    def __init__(self, d_embed, d_hidden, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.gru = nn.GRU(d_embed, d_hidden, batch_first=True, bias=False)
        self.dropout = nn.Dropout(0.1)
        self.decision = nn.Linear(d_hidden, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        hidden = self.gru(embedded)[1].squeeze(dim=0) # final hidden state, (batch, d_hidden)
        return self.decision(self.dropout(hidden))

################################################################################
class BOWClassifier(nn.Module):
    def __init__(self, d_embed, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(d_embed, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        averaged = torch.mean(embedded, dim=1) # dim 0 is batch
        return self.decision(self.dropout(averaged))

################################################################################
def perf(model, dev_loader, criterion):
    model.eval() # disable dropout during evaluation
    total_loss = correct = 0
    for (X, y) in dev_loader:
        with torch.no_grad():
            y_scores = model(X)
            total_loss += criterion(y_scores, y).item()
            y_pred = torch.max(y_scores, dim=1)[1] # argmax
            correct += torch.sum(y_pred.data == y).item()
    total = len(dev_loader.dataset)
    return total_loss / total, correct / total

################################################################################
def fit(model, epochs, train_loader, dev_loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(epochs):
        model.train() # re-enable dropout after perf() switched to eval mode
        total_loss = 0
        for (X, y) in tqdm.tqdm(train_loader) :
            optimizer.zero_grad()
            y_scores = model(X)
            loss = criterion(y_scores, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("train_loss = {:.4f}".format(total_loss / len(train_loader.dataset)))
        print("dev_loss = {:.4f} dev_acc = {:.4f}".format(*perf(model, dev_loader, criterion)))

################################################################################
def pad_tensor(X, max_len):
    res = torch.full((len(X), max_len), 0, dtype=torch.long) # 0 = <PAD> index
    for (i, row) in enumerate(X) :
        x_len = min(max_len, len(row)) # truncate sequences longer than max_len
        res[i,:x_len] = torch.LongTensor(row[:x_len])
    return res

################################################################################
def read_corpus(filename, wordvocab, tagvocab, in_type, train_mode=True, batch_mode=True):
    if train_mode :
        wordvocab = collections.defaultdict(lambda : len(wordvocab))
        wordvocab["<PAD>"]; wordvocab["<UNK>"] # Create special token IDs 0 and 1
        tagvocab = collections.defaultdict(lambda : len(tagvocab))
    words, tags = [], []
    with open(filename, 'r', encoding="utf-8") as corpus:
        for line in corpus:
            fields = line.strip().split()
            tags.append(tagvocab[fields[0]]) # first field is the class label
            fields = " ".join(fields[1:]) if in_type == "char" else fields[1:]
            if train_mode :
                words.append([wordvocab[w] for w in fields])
            else :
                words.append([wordvocab.get(w, wordvocab["<UNK>"]) for w in fields])
    if batch_mode :
        dataset = TensorDataset(pad_tensor(words, 40), torch.LongTensor(tags))
        return DataLoader(dataset, batch_size=32, shuffle=train_mode), wordvocab, tagvocab
    else :
        return words, tags, wordvocab, tagvocab

################################################################################
if __name__ == "__main__" :
    if len(sys.argv) != 5 or \
       sys.argv[3] not in ['bow', 'gru', 'cnn'] or \
       sys.argv[4] not in ['word', 'char'] : # Argparse!
        print("Usage: {} trainfile.txt devfile.txt bow|gru|cnn word|char".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    hp = {"model_type": sys.argv[3], "in_type": sys.argv[4], "d_embed": 250, "d_hidden": 200}
    train_loader, wordvocab, tagvocab = read_corpus(sys.argv[1], None, None, hp["in_type"])
    dev_loader, _, _ = read_corpus(sys.argv[2], wordvocab, tagvocab, hp["in_type"], train_mode=False)
    if hp["model_type"] == "bow" :
        model = BOWClassifier(hp["d_embed"], len(wordvocab), len(tagvocab))
    elif hp["model_type"] == "gru" :
        model = GRUClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    else: #if hp["model_type"] == "cnn" :
        model = CNNClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    fit(model, 15, train_loader, dev_loader)
    torch.save({"wordvocab": dict(wordvocab), # convert defaultdicts for safe re-loading
                "tagvocab": dict(tagvocab),
                "model_params": model.state_dict(),
                "hyperparams": hp}, "model.pt")
# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE
# sent_id = annodis.er_00192
# text = La gare routière attend toujours ses illuminations
1 La le DET _ Gender=Fem|Number=Sing 2 det _ _ * * *
2 gare gare NOUN _ Gender=Fem|Number=Sing 4 nsubj _ _ 1:_|MWE|SYNT * *
3 routière routier ADJ _ Gender=Fem|Number=Sing 2 amod _ _ 1 * *
4 attend attendre VERB _ Number=Sing 0 root _ _ * * *
5 toujours toujours ADV _ _ 4 advmod _ _ * * *
6 ses son DET _ Number=Plur 7 det _ _ * * *
7 illuminations illumination NOUN _ Gender=Fem|Number=Plur 4 obj _ _ * Artifact *
# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE
# sent_id = annodis.er_00192
# text = La gare routière attend toujours ses illuminations
1 La le DET _ Number=Sing 2 det _ _ * * *
2 gare gare NOUN _ Gender=Masc|Number=Sing 4 nsubj _ _ 1:_|MWE|SYNT * *
3 routière routier ADJ _ Gender=Fem|Number=Sing 2 amod _ _ 1 * *
4 attend attendre VERB _ Number=Sing 0 root _ _ * * *
5 toujours toujours ADV _ Gender=Fem|Number=Plur 4 advmod _ _ * * *
6 ses son DET _ Gender=Masc|Number=Plur 7 det _ _ * * *
7 illuminations illumination NOUN _ Gender=Fem 4 obj _ _ * Artifact *