Commit edd42965 authored by Carlos Ramisch

Transfer theoretical course (CM) code snippets and examples to this repo

parent 127f0b3b
Showing 18279 additions and 0 deletions
@@ -2,3 +2,7 @@
Pedagogical materials for the 2024-2025 version of the advanced NLP course
of Master 2 in AI and ML, Aix Marseille University and Centrale Marseille.
* `sequoia`: (simplified) Sequoia corpus used for all lab exercises (TP)
* `lib`: code provided to speed up system development; includes the CoNLL-U library `conllulib` and the evaluation script `accuracy.py`
* `cm-code`: code snippets shown during the theoretical course (CM)
#!/usr/bin/bash
# Count the number of distinct morphological supertags in Sequoia
cat ../../tp/share/sequoia/sequoia-ud.parseme.frsemcor.simple.full |
grep "^[0-9]" |   # keep only token lines (skip comments and empty lines)
cut -d " " -f 6 | # field 6 = FEATS column, used as the morphological supertag
sort |
uniq |
wc -l
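# The field number follows the CoNLL-U column order (1=ID, 2=FORM, 3=LEMMA,
# 4=UPOS, 5=XPOS, 6=FEATS, ...), assuming the simplified corpus is
# space-separated as the -d " " above suggests. For instance, swapping in
# -f 4 would count the distinct UPOS tags with the same pipeline.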
#!/usr/bin/bash
# Build train/dev/test splits for text classification from the Dumas novels.
# Dev and test both come from *.test.tok: dev takes the first 300 kept lines
# and test the last 300, so they do not overlap as long as each novel keeps
# at least 600 lines after filtering.
DUMAS_FOLDER="../../../../../talia/tp/data/alexandre_dumas"

> dumas_train.txt
for f in ${DUMAS_FOLDER}/*.train.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |  # keep sentences with at least 15 tokens
  head -n 1200 |
  sed -E 's@ ?</?s> ?@@g' |        # remove <s> and </s> sentence markers
  awk -v f=`basename ${f%.train.tok}` '{print(f, $0)}' |  # prefix the novel name as class label
  cat >> dumas_train.txt
done

> dumas_dev.txt
for f in ${DUMAS_FOLDER}/*.test.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |
  head -n 300 |
  sed -E 's@ ?</?s> ?@@g' |
  awk -v f=`basename ${f%.test.tok}` '{print(f, $0)}' |
  cat >> dumas_dev.txt
done

> dumas_test.txt
for f in ${DUMAS_FOLDER}/*.test.tok; do
  cat $f |
  awk '{if(NF >= 15) print $0}' |
  tail -n 300 |
  sed -E 's@ ?</?s> ?@@g' |
  awk -v f=`basename ${f%.test.tok}` '{print(f, $0)}' |
  cat >> dumas_test.txt
done
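# Each output line is "<novel-basename> <text...>": the first field serves as
# the class label, which is how read_corpus() in train_textclass.py below
# splits labels from input text.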
#!/usr/bin/bash
# Run prediction and evaluation for every (model, input type) combination.
mkdir -p pred
for model in bow gru cnn; do
  for in_type in word char; do
    echo "Evaluating model model/model-${model}-${in_type}.pt" | tee pred/dumas_test_pred-${model}-${in_type}.acc
    ./predict_textclass.py dumas_test.txt model/model-${model}-${in_type}.pt > pred/dumas_test_pred-${model}-${in_type}.txt
    ./eval_textclass.py dumas_test.txt pred/dumas_test_pred-${model}-${in_type}.txt | tee -a pred/dumas_test_pred-${model}-${in_type}.acc # append: keep the header line written above
  done
done
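# Predictions land in pred/dumas_test_pred-<model>-<in_type>.txt and the
# corresponding accuracy in the matching .acc file, one pair per combination.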
#!/usr/bin/env python3
import sys

if __name__ == "__main__" :
    if len(sys.argv) != 3 : # Prefer using argparse, more flexible
        print("Usage: {} gold-testfile.txt pred-testfile.txt".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    with open(sys.argv[1], 'r', encoding='utf-8') as goldfile,\
         open(sys.argv[2], 'r', encoding='utf-8') as predfile:
        total = correct = 0
        for (gline, pline) in zip(goldfile, predfile) :
            # The class label is the first whitespace-separated field of each line
            correct += int(gline.strip().split()[0] == pline.strip().split()[0])
            total += 1
        print(f"Accuracy = {correct * 100 / total:.2f}")

# ./eval_textclass.py dumas_test.txt dumas_test_pred-bow.txt
# ./eval_textclass.py dumas_test.txt dumas_test_pred-gru.txt
#!/usr/bin/env python3
import sys, torch
from train_textclass import read_corpus, BOWClassifier, GRUClassifier, CNNClassifier

################################################################################
def rev_vocab(vocab):
    rev_dict = {y: x for x, y in vocab.items()}
    return [rev_dict[k] for k in range(len(rev_dict))] # list mapping ID -> label

################################################################################
if __name__ == "__main__" :
    if len(sys.argv) != 3 : # Prefer using argparse, more flexible
        print("Usage: {} testfile.txt modelfile.pt".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    load_dict = torch.load(sys.argv[2], weights_only=False)
    wordvocab = load_dict["wordvocab"]
    tagvocab = load_dict["tagvocab"]
    hp = load_dict["hyperparams"]
    if hp["model_type"] == 'bow' :
        model = BOWClassifier(hp["d_embed"], len(wordvocab), len(tagvocab))
    elif hp["model_type"] == 'gru' :
        model = GRUClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    else : #if hp["model_type"] == 'cnn' :
        model = CNNClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    model.load_state_dict(load_dict["model_params"])
    model.eval() # disable dropout at prediction time
    words, _, _, _ = read_corpus(sys.argv[1], wordvocab, tagvocab, hp["in_type"],
                                 train_mode=False, batch_mode=False)
    revtagvocab = rev_vocab(tagvocab)
    for sentence in words :
        pred_scores = model(torch.LongTensor([sentence])) # No need to batch
        print(revtagvocab[pred_scores.argmax()]) # No need to softmax
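# Example, with checkpoint names as produced by the training script below:
# ./predict_textclass.py dumas_test.txt model/model-gru-word.pt > pred/dumas_test_pred-gru-word.txt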
#!/usr/bin/bash
# Train every (model, input type) combination and archive each checkpoint.
mkdir -p model
for model in bow gru cnn; do
  for in_type in word char; do
    ./train_textclass.py dumas_train.txt dumas_dev.txt ${model} ${in_type}
    mv model.pt model/model-${model}-${in_type}.pt # train_textclass.py always writes model.pt
  done
done
#!/usr/bin/env python3
import sys, torch, collections, tqdm
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

################################################################################
class CNNClassifier(nn.Module):
    def __init__(self, d_embed, d_hidden, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.conv = nn.Conv1d(d_embed, d_hidden, kernel_size=5)
        self.dropout = nn.Dropout(0.1)
        self.decision = nn.Linear(d_hidden, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        conved = self.conv(embedded.transpose(2,1)) # Conv1d expects (batch, d_embed, seq)
        hidden = nn.functional.max_pool1d(conved, conved.shape[-1]) # max over time
        return self.decision(self.dropout(hidden.squeeze()))

################################################################################
class GRUClassifier(nn.Module):
    def __init__(self, d_embed, d_hidden, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.gru = nn.GRU(d_embed, d_hidden, batch_first=True, bias=False)
        self.dropout = nn.Dropout(0.1)
        self.decision = nn.Linear(d_hidden, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        hidden = self.gru(embedded)[1].squeeze(dim=0) # final hidden state, (batch, d_hidden)
        return self.decision(self.dropout(hidden))

################################################################################
class BOWClassifier(nn.Module):
    def __init__(self, d_embed, d_in, d_out):
        super().__init__()
        self.embed = nn.Embedding(d_in, d_embed, padding_idx=0)
        self.dropout = nn.Dropout(0.3)
        self.decision = nn.Linear(d_embed, d_out)
    def forward(self, idx_words):
        embedded = self.embed(idx_words)
        averaged = torch.mean(embedded, dim=1) # dim 0 is batch
        return self.decision(self.dropout(averaged))

################################################################################
def perf(model, dev_loader, criterion):
    model.eval() # disable dropout during evaluation
    total_loss = correct = 0
    for (X, y) in dev_loader:
        with torch.no_grad():
            y_scores = model(X)
            total_loss += criterion(y_scores, y).item()
            y_pred = torch.max(y_scores, dim=1)[1] # argmax
            correct += torch.sum(y_pred.data == y).item()
    total = len(dev_loader.dataset)
    return total_loss / total, correct / total

################################################################################
def fit(model, epochs, train_loader, dev_loader):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(epochs):
        model.train() # re-enable dropout after perf() switched to eval mode
        total_loss = 0
        for (X, y) in tqdm.tqdm(train_loader) :
            optimizer.zero_grad()
            y_scores = model(X)
            loss = criterion(y_scores, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("train_loss = {:.4f}".format(total_loss / len(train_loader.dataset)))
        print("dev_loss = {:.4f} dev_acc = {:.4f}".format(*perf(model, dev_loader, criterion)))

################################################################################
def pad_tensor(X, max_len):
    res = torch.full((len(X), max_len), 0, dtype=torch.long) # 0 = <PAD> index
    for (i, row) in enumerate(X) :
        x_len = min(max_len, len(row)) # truncate sequences longer than max_len
        res[i,:x_len] = torch.LongTensor(row[:x_len])
    return res

################################################################################
def read_corpus(filename, wordvocab, tagvocab, in_type, train_mode=True, batch_mode=True):
    if train_mode :
        wordvocab = collections.defaultdict(lambda : len(wordvocab))
        wordvocab["<PAD>"]; wordvocab["<UNK>"] # Create special token IDs 0 and 1
        tagvocab = collections.defaultdict(lambda : len(tagvocab))
    words, tags = [], []
    with open(filename, 'r', encoding="utf-8") as corpus:
        for line in corpus:
            fields = line.strip().split()
            tags.append(tagvocab[fields[0]]) # first field is the class label
            fields = " ".join(fields[1:]) if in_type == "char" else fields[1:]
            if train_mode :
                words.append([wordvocab[w] for w in fields])
            else :
                words.append([wordvocab.get(w, wordvocab["<UNK>"]) for w in fields])
    if batch_mode :
        dataset = TensorDataset(pad_tensor(words, 40), torch.LongTensor(tags))
        return DataLoader(dataset, batch_size=32, shuffle=train_mode), wordvocab, tagvocab
    else :
        return words, tags, wordvocab, tagvocab

################################################################################
if __name__ == "__main__" :
    if len(sys.argv) != 5 or \
       sys.argv[3] not in ['bow', 'gru', 'cnn'] or \
       sys.argv[4] not in ['word', 'char'] : # Argparse!
        print("Usage: {} trainfile.txt devfile.txt bow|gru|cnn word|char".format(sys.argv[0]), file=sys.stderr)
        sys.exit(-1)
    hp = {"model_type": sys.argv[3], "in_type": sys.argv[4], "d_embed": 250, "d_hidden": 200}
    train_loader, wordvocab, tagvocab = read_corpus(sys.argv[1], None, None, hp["in_type"])
    dev_loader, _, _ = read_corpus(sys.argv[2], wordvocab, tagvocab, hp["in_type"], train_mode=False)
    if hp["model_type"] == "bow" :
        model = BOWClassifier(hp["d_embed"], len(wordvocab), len(tagvocab))
    elif hp["model_type"] == "gru" :
        model = GRUClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    else: #if hp["model_type"] == "cnn" :
        model = CNNClassifier(hp["d_embed"], hp["d_hidden"], len(wordvocab), len(tagvocab))
    fit(model, 15, train_loader, dev_loader)
    torch.save({"wordvocab": dict(wordvocab), # convert defaultdicts for safe re-loading
                "tagvocab": dict(tagvocab),
                "model_params": model.state_dict(),
                "hyperparams": hp}, "model.pt")
# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE
# sent_id = annodis.er_00192
# text = La gare routière attend toujours ses illuminations
1 La le DET _ Gender=Fem|Number=Sing 2 det _ _ * * *
2 gare gare NOUN _ Gender=Fem|Number=Sing 4 nsubj _ _ 1:_|MWE|SYNT * *
3 routière routier ADJ _ Gender=Fem|Number=Sing 2 amod _ _ 1 * *
4 attend attendre VERB _ Number=Sing 0 root _ _ * * *
5 toujours toujours ADV _ _ 4 advmod _ _ * * *
6 ses son DET _ Number=Plur 7 det _ _ * * *
7 illuminations illumination NOUN _ Gender=Fem|Number=Plur 4 obj _ _ * Artifact *
# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC PARSEME:MWE FRSEMCOR:NOUN PARSEME:NE
# sent_id = annodis.er_00192
# text = La gare routière attend toujours ses illuminations
1 La le DET _ Number=Sing 2 det _ _ * * *
2 gare gare NOUN _ Gender=Masc|Number=Sing 4 nsubj _ _ 1:_|MWE|SYNT * *
3 routière routier ADJ _ Gender=Fem|Number=Sing 2 amod _ _ 1 * *
4 attend attendre VERB _ Number=Sing 0 root _ _ * * *
5 toujours toujours ADV _ Gender=Fem|Number=Plur 4 advmod _ _ * * *
6 ses son DET _ Gender=Masc|Number=Plur 7 det _ _ * * *
7 illuminations illumination NOUN _ Gender=Fem 4 obj _ _ * Artifact *