Compare revisions — project alexis.nasr/tbp, branches master and tania.
Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2). The changed files follow, beginning with the project .gitignore.
*.tgz
data/
expe/out/
slides/
*.pytorch
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
\ No newline at end of file
# tbp
## List of todos
### Todo
- [ ] complete implemented models with word and POS embeddings (a hedged sketch follows this list) @tatiana.bladier
- [ ] word embeddings
- [ ] POS embeddings
### In Progress
- [ ]
### Done ✓
- [x]
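A minimal sketch of what the planned word and POS embedding inputs could look like in PyTorch; the class name, layer sizes, and feature layout below are assumptions for illustration, not the repository's code:

import torch
import torch.nn as nn

class EmbeddingMLP(nn.Module):
    # Hypothetical sketch for the todo above: embed word ids and POS ids instead of one-hot features.
    def __init__(self, n_words, n_pos, emb_dim, n_feats, n_moves):
        super().__init__()
        self.word_emb = nn.Embedding(n_words, emb_dim)
        self.pos_emb = nn.Embedding(n_pos, emb_dim)
        self.out = nn.Linear(2 * n_feats * emb_dim, n_moves)

    def forward(self, word_ids, pos_ids):
        # word_ids and pos_ids are (batch, n_feats) integer tensors
        x = torch.cat([self.word_emb(word_ids), self.pos_emb(pos_ids)], dim=-1)
        return self.out(x.flatten(1))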
##POS
NULL
ROOT
DET
NOUN
ADP
VERB
ADJ
PUNCT
CCONJ
PROPN
AUX
ADV
PRON
NUM
X
SCONJ
PART
INTJ
SYM
##LABEL
NULL
ROOT
det
nsubj
case
nmod
root
amod
obl
punct
cc
conj
aux
advmod
xcomp
appos
acl
dep
obj
iobj
cop
mark
advcl
flat
fixed
nummod
compound
ccomp
parataxis
expl
csubj
dislocated
orphan
discourse
vocative
goeswith
##EOS
NULL
ROOT
0
1
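The block above appears to be the dictionary file referenced as PLE.dic in the script below: each ##-header opens a column (POS, LABEL, EOS) and the following lines list its symbols. A minimal standalone sketch of reading such a file (the repository's own reader is Dicos.initializeWithDicoFile, shown further down in this diff):

def read_dic(path):
    # Parse a "##SECTION"-style dico file into {section: [symbols]}.
    sections, current = {}, None
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith('##'):
                current = line[2:]
                sections[current] = []
            elif current is not None:
                sections[current].append(line)
    return sections

# read_dic('PLE.dic') -> {'POS': ['NULL', 'ROOT', 'DET', ...], 'LABEL': [...], 'EOS': ['NULL', 'ROOT', '0', '1']}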
lang=$1
mkdir -p out
train_conll="../data/train_${lang}.conllu"
train_proj_conll="./out/train_${lang}_proj.conllu"
train_mcf="./out/train_${lang}_pgle.mcf"
......@@ -12,6 +13,7 @@ dev_cff="./out/dev_${lang}.cff"
dev_word_limit="5000"
test_conll="../data/test_${lang}.conllu"
test_mcf="./out/test_${lang}_pgle.mcf"
test_mcf_hyp="./out/test_${lang}_hyp.mcf"
test_word_limit="700"
......@@ -20,7 +22,7 @@ feat_model="basic.fm"
dicos="./out/${lang}_train.dic"
dicos="PLE.dic"
model="./out/${lang}.keras"
model="./out/${lang}.pytorch"
results="./out/${lang}.res"
mcd_pgle="PGLE.mcd"
......@@ -41,11 +43,17 @@ python3 ../src/mcf2cff.py $dev_mcf $feat_model $mcd_pgle $dicos $dev_cff $dev_wo
python3 ../src/mcf2cff.py $train_mcf $feat_model $mcd_pgle $dicos $train_cff $train_word_limit
python3 ../src/tbp_train.py $train_cff $dev_cff $model
#python3 ../src/tbp_train.py $train_cff $dev_cff $model
python3 ../src/tbp_train_pytorch.py $train_cff $dev_cff $model
#python3 ../src/tbp_train_keras.py $train_cff $dev_cff $model
#python3 ../src/tbp_decode.py $test_mcf $model $dicos $feat_model $mcd_pgle $test_word_limit > $test_mcf_hyp
python3 ../src/tbp_decode_pytorch.py $test_mcf $model $dicos $feat_model $mcd_pgle $test_word_limit $train_cff $dev_cff > $test_mcf_hyp
#python3 ../src/tbp_decode_keras.py $test_mcf $model $dicos $feat_model $mcd_pgle $test_word_limit > $test_mcf_hyp
python3 ../src/tbp_decode.py $test_mcf $model $dicos $feat_model $mcd_pgle $test_word_limit > $test_mcf_hyp
python3 ../src/eval_mcf.py $test_mcf $test_mcf_hyp $mcd_pgle $mcd_pgle $lang > $results
python3 ../src/eval_mcf.py $test_mcf $test_mcf_hyp $mcd_pgle $mcd_pgle $lang verbose > $results
......
......@@ -38,6 +38,7 @@ class Config:
sys.stderr.write("cannot reduce an empty stack !\n")
return False
#print('AHHHHHHHHH', int(self.getBuffer().getWord(self.getStack().top()).getFeat('GOV')))
if int(self.getBuffer().getWord(self.getStack().top()).getFeat('GOV')) == Word.invalidGov() :
sys.stderr.write("cannot reduce the stack if top element does not have a governor !\n")
return False
......@@ -52,6 +53,7 @@ class Config:
govIndex = self.getStack().top()
depIndex = self.getBuffer().currentIndex
#print('560 govIndex, depIndex ', govIndex, depIndex)
self.getBuffer().getCurrentWord().setFeat('LABEL', label)
self.getBuffer().getCurrentWord().setFeat('GOV', str(govIndex - depIndex))
self.getBuffer().getWord(self.getStack().top()).addRightDaughter(depIndex)
......@@ -66,6 +68,7 @@ class Config:
govIndex = self.getBuffer().currentIndex
depIndex = self.getStack().top()
#print('561 govIndex, depIndex ', govIndex, depIndex)
self.getBuffer().getWord(self.getStack().top()).setFeat('LABEL', label)
self.getBuffer().getWord(self.getStack().top()).setFeat('GOV', str(govIndex - depIndex))
self.getBuffer().getCurrentWord().addLeftDaughter(depIndex)
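# Read together, the two hunks above store the governor as a signed offset relative to the
# dependent (assuming they are the right-arc and left-arc operations). A worked example:
#   right arc: govIndex = 3 (stack top), depIndex = 5 (current buffer word)
#              -> word 5 gets GOV = str(3 - 5) = "-2"  (its governor sits two tokens to the left)
#   left arc:  govIndex = 5 (current buffer word), depIndex = 3 (stack top)
#              -> word 3 gets GOV = str(5 - 3) = "2"   (its governor sits two tokens to the right)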
......@@ -126,19 +129,19 @@ class Config:
container = featTuple[2]
index = featTuple[3]
word = self.getWordWithRelativeIndex(containe, index)
word = self.getWordWithRelativeIndex(container, index)
if word == None :
return 'NULL'
return string(len(word.getLeftDaughters()))
return str(len(word.getLeftDaughters()))
def getNrDepFeat(self, featTuple):
container = featTuple[2]
index = featTuple[3]
word = self.getWordWithRelativeIndex(containe, index)
word = self.getWordWithRelativeIndex(container, index)
if word == None :
return 'NULL'
return string(len(word.getRightDaughters()))
return str(len(word.getRightDaughters()))
def getLlDepFeat(self, featTuple):
......@@ -148,7 +151,7 @@ class Config:
return 'NULL'
def getStackHeightFeat(self, featTuple):
string(self.getStack().getLength())
str(self.getStack().getLength())
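# Note: neither the old nor the new version of getStackHeightFeat returns the string it builds,
# so the feature value comes back as None.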
def getDistFeat(self, featTuple):
containerWord1 = featTuple[1]
......
......@@ -5,9 +5,11 @@ class Dicos:
self.content = {}
if mcd :
self.initializeWithMcd(mcd)
if fileName :
self.initializeWithDicoFile(fileName)
def initializeWithMcd(self, mcd):
for index in range(mcd.getNbCol()):
if(mcd.getColStatus(index) == 'KEEP') and (mcd.getColType(index) == 'SYM') :
......
......@@ -61,7 +61,7 @@ class FeatModel:
label = self.getFeatLabel(i)
size = dicos.getDico(label).getSize()
position = dicos.getCode(label, featVec[i])
#print('featureName = ', featureName, 'value =', featVec[i], 'size =', size, 'position =', position, 'origin =', origin)
#print('featureName = ', label, 'value =', featVec[i], 'size =', size, 'position =', position, 'origin =', origin)
inputVector[origin + position] = 1
origin += size
return inputVector
......@@ -73,7 +73,7 @@ class FeatModel:
label = self.getFeatLabel(i)
size = dicos.getDico(label).getSize()
position = dicos.getCode(label, featVec[i])
# print('featureName = ', featureName, 'value =', featVec[i], 'size =', size, 'position =', position, 'origin =', origin)
#print('featureName = ', label, 'value =', featVec[i], 'size =', size, 'position =', position, 'origin =', origin)
# print('value =', featVec[i], 'size =', size, 'position =', position, 'origin =', origin)
inputVector[i] = position
origin += size
......
......@@ -3,6 +3,7 @@ import numpy as np
class Moves:
def __init__(self, dicos):
self.dicoLabels = dicos.getDico('LABEL')
if not self.dicoLabels :
print("cannot find LABEL in dicos")
exit(1)
......@@ -26,14 +27,18 @@ class Moves:
if(mvtType == 'LEFT'): return 3 + 2 * labelCode + 1
def mvtDecode(self, mvt_Code):
if(mvt_Code == 0) : return ('SHIFT', 'NULL')
if(mvt_Code == 0) :
return ('SHIFT', 'NULL')
if(mvt_Code == 1) : return ('REDUCE', 'NULL')
if(mvt_Code == 2) : return ('ROOT', 'NULL')
if mvt_Code % 2 == 0 : #LEFT
labelCode = int((mvt_Code - 4) / 2)
#print("label code: ", labelCode)
return ('LEFT', self.dicoLabels.getSymbol(labelCode))
else :
labelCode = int((mvt_Code - 3)/ 2)
#print("label code: ", labelCode)
return ('RIGHT', self.dicoLabels.getSymbol(labelCode))
def buildOutputVectorOneHot(self, mvt):
......@@ -46,4 +51,5 @@ class Moves:
outputVector = np.zeros(1, dtype="int32")
codeMvt = self.mvtCode(mvt)
outputVector[0] = codeMvt
return outputVector
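# Worked example of the move encoding above (label code 5 is hypothetical; real codes come from the LABEL dico):
#   mvtCode:   RIGHT, labelCode 5 -> 3 + 2*5     = 13  (odd)
#              LEFT,  labelCode 5 -> 3 + 2*5 + 1 = 14  (even)
#   mvtDecode: 14 is even -> labelCode = (14 - 4) / 2 = 5 -> ('LEFT',  dicoLabels.getSymbol(5))
#              13 is odd  -> labelCode = (13 - 3) / 2 = 5 -> ('RIGHT', dicoLabels.getSymbol(5))
#   Codes 0, 1 and 2 remain reserved for ('SHIFT', 'NULL'), ('REDUCE', 'NULL') and ('ROOT', 'NULL').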
......@@ -40,7 +40,7 @@ class Word:
else:
print('\t', end='')
print(self.getFeat(mcd.getColName(columnNb)), end='')
# print('')
print('')
@staticmethod
def fakeWordConll():
......
......@@ -32,7 +32,8 @@ class WordBuffer:
def affiche(self, mcd):
for w in self.array:
w.affiche(mcd)
print('')
#print('')
#print(w.affiche(mcd))
def getLength(self):
return len(self.array)
......
......@@ -19,10 +19,10 @@ else:
verbose = False
#print('reading mcd from file :', refMcdFileName)
print('reading mcd from file :', refMcdFileName)
refMcd = Mcd(refMcdFileName)
#print('reading mcd from file :', hypMcdFileName)
print('reading mcd from file :', hypMcdFileName)
hypMcd = Mcd(hypMcdFileName)
GovColIndex = refMcd.locateCol('GOV')
......
......@@ -96,7 +96,7 @@ featModel = FeatModel(featModelFileName, dicos)
inputSize = featModel.getInputSize()
outputSize = moves.getNb()
print('input size = ', inputSize, 'outputSize =' , outputSize)
print('featModel input size = ', inputSize, 'outputSize =' , outputSize)
print('preparing training data')
prepareData(mcd, mcfFileName, featModel, moves, dataFileName, wordsLimit)
......
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import numpy as np
import torch
from IPython.display import HTML, display
def set_default(figsize=(10, 10), dpi=100):
plt.style.use(['dark_background', 'bmh'])
plt.rc('axes', facecolor='k')
plt.rc('figure', facecolor='k')
plt.rc('figure', figsize=figsize, dpi=dpi)
def plot_data(X, y, d=0, auto=False, zoom=1):
X = X.cpu()
y = y.cpu()
plt.scatter(X.numpy()[:, 0], X.numpy()[:, 1], c=y, s=20, cmap=plt.cm.Spectral)
plt.axis('square')
plt.axis(np.array((-1.1, 1.1, -1.1, 1.1)) * zoom)
if auto is True: plt.axis('equal')
plt.axis('off')
_m, _c = 0, '.15'
plt.axvline(0, ymin=_m, color=_c, lw=1, zorder=0)
plt.axhline(0, xmin=_m, color=_c, lw=1, zorder=0)
def plot_model(X, y, model):
model.cpu()
mesh = np.arange(-1.1, 1.1, 0.01)
xx, yy = np.meshgrid(mesh, mesh)
with torch.no_grad():
data = torch.from_numpy(np.vstack((xx.reshape(-1), yy.reshape(-1))).T).float()
Z = model(data).detach()
Z = np.argmax(Z, axis=1).reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.3)
plot_data(X, y)
def show_scatterplot(X, colors, title='', axis=True):
colors = cm.rainbow(np.linspace(0, 1, len(X)))  # the original used the undefined name 'ys'; one colour per sample in X is assumed
X = X.numpy()
# plt.figure()
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], c=colors, s=30)
# plt.grid(True)
plt.title(title)
plt.axis('off')
_m, _c = 0, '.15'
if axis:
plt.axvline(0, ymin=_m, color=_c, lw=1, zorder=0)
plt.axhline(0, xmin=_m, color=_c, lw=1, zorder=0)
def plot_bases(bases, plotting=True, width=0.04):
bases[2:] -= bases[:2]
# if plot_bases.a: plot_bases.a.set_visible(False)
# if plot_bases.b: plot_bases.b.set_visible(False)
if plotting:
plot_bases.a = plt.arrow(*bases[0], *bases[2], width=width, color='r', zorder=10, alpha=1., length_includes_head=True)
plot_bases.b = plt.arrow(*bases[1], *bases[3], width=width, color='g', zorder=10, alpha=1., length_includes_head=True)
plot_bases.a = None
plot_bases.b = None
def show_mat(mat, vect, prod, threshold=-1):
# Subplot grid definition
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharex=False, sharey=True,
gridspec_kw={'width_ratios':[5,1,1]})
# Plot matrices
cax1 = ax1.matshow(mat.numpy(), clim=(-1, 1))
ax2.matshow(vect.numpy(), clim=(-1, 1))
cax3 = ax3.matshow(prod.numpy(), clim=(threshold, 1))
# Set titles
ax1.set_title(f'A: {mat.size(0)} \u00D7 {mat.size(1)}')
ax2.set_title(f'a^(i): {vect.numel()}')
ax3.set_title(f'p: {prod.numel()}')
# Remove xticks for vectors
ax2.set_xticks(tuple())
ax3.set_xticks(tuple())
# Plot colourbars
fig.colorbar(cax1, ax=ax2)
fig.colorbar(cax3, ax=ax3)
# Fix y-axis limits
ax1.set_ylim(bottom=max(len(prod), len(vect)) - 0.5)
colors = dict(
aqua='#8dd3c7',
yellow='#ffffb3',
lavender='#bebada',
red='#fb8072',
blue='#80b1d3',
orange='#fdb462',
green='#b3de69',
pink='#fccde5',
grey='#d9d9d9',
violet='#bc80bd',
unk1='#ccebc5',
unk2='#ffed6f',
)
def _cstr(s, color='black'):
if s == ' ':
return f'<text style=color:#000;padding-left:10px;background-color:{color}> </text>'
else:
return f'<text style=color:#000;background-color:{color}>{s} </text>'
# print html
def _print_color(t):
display(HTML(''.join([_cstr(ti, color=ci) for ti, ci in t])))
# get appropriate color for value
def _get_clr(value):
colors = ('#85c2e1', '#89c4e2', '#95cae5', '#99cce6', '#a1d0e8',
'#b2d9ec', '#baddee', '#c2e1f0', '#eff7fb', '#f9e8e8',
'#f9e8e8', '#f9d4d4', '#f9bdbd', '#f8a8a8', '#f68f8f',
'#f47676', '#f45f5f', '#f34343', '#f33b3b', '#f42e2e')
value = int((value * 100) / 5)
if value == len(colors): value -= 1 # fixing bugs...
return colors[value]
def _visualise_values(output_values, result_list):
text_colours = []
for i in range(len(output_values)):
text = (result_list[i], _get_clr(output_values[i]))
text_colours.append(text)
_print_color(text_colours)
def print_colourbar():
color_range = torch.linspace(-2.5, 2.5, 20)
to_print = [(f'{x:.2f}', _get_clr((x+2.5)/5)) for x in color_range]
_print_color(to_print)
# Let's only focus on the last time step for now
# First, the cell state (Long term memory)
def plot_state(data, state, b, decoder, idx_to_symbol):
actual_data = decoder(data[b, :, :].numpy(), idx_to_symbol)
seq_len = len(actual_data)
seq_len_w_pad = len(state)
for s in range(state.size(2)):
states = torch.sigmoid(state[:, b, s])
_visualise_values(states[seq_len_w_pad - seq_len:], list(actual_data))
\ No newline at end of file
import math
import numpy as np
import six
""" read file in cff format
the contents of such files look like this:
133
75
0
0 0 1 4 2 0 0
0
0 1 4 2 3 1 0
the first two lines give the network input size and output size (here 133 and 75);
the remaining lines alternate between a gold move code and its feature vector
"""
def readFile_cff(filepath):
lines = []
with open(filepath) as file_in:
for line in file_in:
lines.append(line)
inputSize = int(lines[0].strip())
outputSize = int(lines[1].strip())
items_labels = [x.strip() for x in lines[2:]]
items = items_labels[1::2]
labels = items_labels[::2]
return items, labels, inputSize, outputSize
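# Minimal usage sketch, driven on a file shaped like the docstring example above (the path is an assumption):
#   items, labels, inputSize, outputSize = readFile_cff('out/train_fr.cff')
#   inputSize == 133 and outputSize == 75
#   labels[0] == '0'               gold move code of the first configuration
#   items[0]  == '0 0 1 4 2 0 0'   the corresponding feature vector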
#flatten a list
def flatten(xss):
return [x for xs in xss for x in xs]
#find how many classes we have in data:
def find_unique_classes(*inputlists):
unique_classes = np.unique(flatten([*inputlists])).tolist()
unique_classes = [str(x) for x in unique_classes]
number_of_unique_classes = len(unique_classes)
return unique_classes, number_of_unique_classes
def find_unique_symbols(*inputlists):
flat_list = flatten([*inputlists])
maxlen = len(max(flat_list, key=len))
unique_elem_list = []
for elem in flat_list:
tokens = [x for x in elem.split(' ')]
for t in tokens:
if not t in unique_elem_list:
unique_elem_list.append(t)
return maxlen, unique_elem_list + [' ']
def pad_sequences(sequences, maxlen=None, dtype='int32',
padding='pre', truncating='pre', value=0.):
if not hasattr(sequences, '__len__'):
raise ValueError('`sequences` must be iterable.')
lengths = []
for x in sequences:
if not hasattr(x, '__len__'):
raise ValueError('`sequences` must be a list of iterables. '
'Found non-iterable: ' + str(x))
lengths.append(len(x))
num_samples = len(sequences)
#print("NUM SAMPLES", num_samples)
if maxlen is None:
maxlen = np.max(lengths)
# take the sample shape from the first non empty sequence
# checking for consistency in the main loop below.
sample_shape = tuple()
for s in sequences:
if len(s) > 0:
sample_shape = np.asarray(s).shape[1:]
break
is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_)
if isinstance(value, six.string_types) and dtype != object and not is_dtype_str:
raise ValueError("`dtype` {} is not compatible with `value`'s type: {}\n"
"You should set `dtype=object` for variable length strings."
.format(dtype, type(value)))
x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype)
for idx, s in enumerate(sequences):
if not len(s):
continue # empty list/array was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError('Truncating type "%s" '
'not understood' % truncating)
# check `trunc` has expected shape
trunc = np.asarray(trunc, dtype=dtype)
if trunc.shape[1:] != sample_shape:
raise ValueError('Shape of sample %s of sequence at position %s '
'is different from expected shape %s' %
(trunc.shape[1:], idx, sample_shape))
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError('Padding type "%s" not understood' % padding)
return x
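# Example with the defaults (padding='pre', truncating='pre', value=0):
#   pad_sequences([[1, 2], [3, 4, 5, 6]])
#   -> array([[0, 0, 1, 2],
#             [3, 4, 5, 6]], dtype=int32)
# Shorter sequences are left-padded with zeros up to the length of the longest one.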
def to_categorical(y, num_classes=None, dtype='float32'):
y = np.array(y, dtype='int')
input_shape = y.shape
if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
input_shape = tuple(input_shape[:-1])
y = y.ravel()
if not num_classes:
num_classes = np.max(y) + 1
n = y.shape[0]
categorical = np.zeros((n, num_classes), dtype=dtype)
categorical[np.arange(n), y] = 1
output_shape = input_shape + (num_classes,)
categorical = np.reshape(categorical, output_shape)
return categorical
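# Example: class indices become one-hot rows.
#   to_categorical([1, 0, 2], num_classes=3)
#   -> array([[0., 1., 0.],
#             [1., 0., 0.],
#             [0., 0., 1.]], dtype=float32)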
def make_pytorch_dicts(*input_paths):
input_paths = [x for x in input_paths]
item_lsts = []
label_lsts = []
for inpath in input_paths:
items_list, labels_list, _, _ = readFile_cff(inpath)
item_lsts.append(items_list)
label_lsts.append(labels_list)
items_list = flatten(item_lsts)
labels_list = flatten(label_lsts)
classes, n_classes = find_unique_classes(labels_list) # possible moves
maxlen, all_symbols = find_unique_symbols(items_list) # basically, the number of POS tags
n_symbols = len(all_symbols)
#print('Number of classes: ', n_classes)
#print('Number of symbols: ', n_symbols)
#print('Max length of sequence: ', maxlen)
symbol_to_idx = {s: n for n, s in enumerate(all_symbols)}
idx_to_symbol = {n: s for n, s in enumerate(all_symbols)}
class_to_idx = {c: n for n, c in enumerate(classes)}
idx_to_class = {n: c for n, c in enumerate(classes)}
return n_classes, maxlen, n_symbols, symbol_to_idx, idx_to_symbol, class_to_idx, idx_to_class, classes
def encode_x_batch(x_batch, symbol_to_idx, n_symbols):
return pad_sequences([encode_x(x, symbol_to_idx, n_symbols) for x in x_batch],maxlen=15) # TODO read maxlen from the file
def encode_x(x, symbol_to_idx, n_symbols):
idx_x = [symbol_to_idx[s] for s in x]
return to_categorical(idx_x, num_classes=n_symbols)
def encode_y_batch(y_batch, class_to_idx, n_classes):
return np.array([encode_y(y, class_to_idx, n_classes) for y in y_batch])
def encode_y(y, class_to_idx, n_classes):
idx_y = class_to_idx[y]
return to_categorical(idx_y, num_classes=n_classes)
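# Note: encode_x iterates over the characters of the feature string (spaces included),
# which is why find_unique_symbols appends ' ' to the symbol inventory.
# Sketch with hypothetical tables (the real ones come from make_pytorch_dicts):
#   symbol_to_idx = {'0': 0, '1': 1, '4': 2, ' ': 3}
#   class_to_idx = {'0': 0, '1': 1, '2': 2}
#   encode_x('0 1 4', symbol_to_idx, n_symbols=4)  -> 5 x 4 one-hot matrix (rows for '0', ' ', '1', ' ', '4')
#   encode_y('2', class_to_idx, n_classes=3)       -> array([0., 0., 1.], dtype=float32)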
def preprocess_data(items_list, labels_list, batch_size, symbol_to_idx, class_to_idx, n_symbols, n_classes):
upp_bound = int(math.ceil(len(items_list) / batch_size))
data_batches = []
i = 0
for i in range(upp_bound -1):
a = i * batch_size
b = (i + 1) * batch_size
data_batches.append((encode_x_batch(items_list[a:b], symbol_to_idx, n_symbols), encode_y_batch(labels_list[a:b], class_to_idx, n_classes)))
i+=1
return data_batches
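# Note: range(upp_bound - 1) builds one batch fewer than upp_bound, so the trailing (possibly
# partial) batch is dropped, and the final i += 1 has no effect since i is the loop variable.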
def decode_x(x, idx_to_symbol):
x = x[np.sum(x, axis=1) > 0] # remove padding
return ''.join([idx_to_symbol[pos] for pos in np.argmax(x, axis=1)])
def decode_y(y, idx_to_class):
return idx_to_class[np.argmax(y)]
def decode_x_batch(x_batch, idx_to_symbol):
return [decode_x(x, idx_to_symbol) for x in x_batch]
def decode_y_batch(y_batch, idx_to_class):
return [idx_to_class[pos] for pos in np.argmax(y_batch, axis=1)]
\ No newline at end of file
......@@ -9,6 +9,7 @@ from FeatModel import FeatModel
import torch
import numpy as np
from pytorch_utils import *
......@@ -23,9 +24,9 @@ def prepareWordBufferForDecode(buffer):
word.setFeat('LABEL', Word.invalidLabel())
verbose = False
if len(sys.argv) != 7 :
print('usage:', sys.argv[0], 'mcf_file model_file dicos_file feat_model mcd_file words_limit')
verbose = True
if len(sys.argv) != 9 :
print('usage:', sys.argv[0], 'mcf_file model_file dicos_file feat_model mcd_file words_limit train_file dev_file')
exit(1)
mcf_file = sys.argv[1]
......@@ -34,76 +35,178 @@ dicos_file = sys.argv[3]
feat_model = sys.argv[4]
mcd_file = sys.argv[5]
wordsLimit = int(sys.argv[6])
train_fr_file = sys.argv[7]
dev_fr_file = sys.argv[8]
##########################################################################
sys.stderr.write('reading mcd from file :')
sys.stderr.write(mcd_file)
sys.stderr.write('\n')
mcd = Mcd(mcd_file)
n_classes, maxlen, n_symbols, symbol_to_idx, idx_to_symbol, class_to_idx, idx_to_class, _ = make_pytorch_dicts(dev_fr_file, train_fr_file)
##########################################################################
sys.stderr.write('loading dicos\n')
dicos = Dicos(fileName=dicos_file)
#print('reading mcd from file : ', mcd_file) # PGLE.mcd
mcd = Mcd(mcd_file) # MCD = multi column descriptions
#print('loading dicos from : ', dicos_file) # PLE.dic
dicos = Dicos(fileName=dicos_file) # dictionaries with class labels for columns from mcd
moves = Moves(dicos)
sys.stderr.write('reading feature model from file :')
sys.stderr.write(feat_model)
sys.stderr.write('\n')
#print('reading feature model from file : ', feat_model) # basic.fm
featModel = FeatModel(feat_model, dicos)
sys.stderr.write('loading model :')
sys.stderr.write(model_file)
sys.stderr.write('\n')
model = load_model(model_file)
#print('loading pytorch model from :', model_file) # /home/taniabladier/Programming/AMU/tbp/expe/out/fr.pytorch
############################
"""
load saved pytorch model
"""
batch_size = 32
# Setup the RNN and training settings
import torch.nn as nn
import torch.nn.functional as F
class SimpleLSTM(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.lstm = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = torch.nn.Linear(hidden_size, output_size)
def forward(self, x):
h = self.lstm(x)[0]
x = self.linear(h)
return x
def get_states_across_time(self, x):
h_c = None
h_list, c_list = list(), list()
with torch.no_grad():
for t in range(x.size(1)):
h_c = self.lstm(x[:, [t], :], h_c)[1]
h_list.append(h_c[0])
c_list.append(h_c[1])
h = torch.cat(h_list)
c = torch.cat(c_list)
return h, c
input_size = 133 #n_symbols
hidden_size = 128
output_size = 75 #n_classes
print('input output', input_size, output_size)
model = SimpleLSTM(input_size, hidden_size, output_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
checkpoint = torch.load(model_file, map_location=torch.device('cpu'), weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputSize = featModel.getInputSize()
outputSize = moves.getNb()
model = model.to(device)
model.eval();
############################
#model = load_model(model_file)
#inputSize = featModel.getInputSize()
#outputSize = moves.getNb()
c = Config(mcf_file, mcd, dicos)
numSent = 0
verbose = False
verbose = True
numWords = 0
while c.getBuffer().readNextSentence() and numWords < wordsLimit :
c.getStack().empty()
prepareWordBufferForDecode(c.getBuffer())
numWords += c.getBuffer().getLength()
while True :
featVec = c.extractFeatVec(featModel)
inputVector = featModel.buildInputVector(featVec, dicos)
outputVector = model.predict(inputVector.reshape((1,inputSize)), batch_size=1, verbose=0, steps=None)
mvt_Code = outputVector.argmax()
###############
inputVector = ' '.join(str(x) for x in inputVector)
inputVector = encode_x_batch([inputVector], symbol_to_idx, 133) #n_symbols)
inputVector = torch.from_numpy(inputVector).float().to(device)
with torch.no_grad():
output = model(inputVector)
# Pick only the output corresponding to last sequence element (input is pre padded)
output = output[:, -1, :]
preds = output.argmax(dim=1)
mvt_Code = int([idx_to_class[y.item()] for y in preds][0])
##################
#mvt_Code = outputVector.argmax()
#mvt = moves.mvtDecode(mvt_Code)
mvt = moves.mvtDecode(mvt_Code)
verbose = True
if(verbose == True) :
print("------------------------------------------")
"""
print("\n\n------------------------------------------\n")
print('inputVector ::', featModel.buildInputVector(featVec, dicos))
print('mvt code: ', mvt_Code, ' move ::: ', mvt)
c.affiche()
print('predicted move', mvt[0], mvt[1])
print(mvt, featVec)
res = c.applyMvt(mvt)
print('predicted move: ', mvt[0], mvt[1], 'for', str(featVec))
"""
res = c.applyMvt(mvt) #result, True or False
#print("RES", res)
if not res :
sys.stderr.write("cannot apply predicted movement\n")
print("cannot apply predicted movement\n")
mvt_type = mvt[0]
mvt_label = mvt[1]
if mvt_type != "SHIFT" :
sys.stderr.write("try to force SHIFT\n")
print("try to force SHIFT\n")
res = c.shift()
if res == False :
sys.stderr.write("try to force REDUCE\n")
print("try to force REDUCE\n")
res = c.red()
if res == False :
sys.stderr.write("abort sentence\n")
print("abort sentence\n")
break
if(c.isFinal()):
break
for i in range(1, c.getBuffer().getLength()):
w = c.getBuffer().getWord(i)
w.affiche(mcd)
print('')
# print('\t', w.getFeat("GOV"), end='\t')
# print(w.getFeat("LABEL"))
numSent += 1
# if numSent % 10 == 0:
......
import sys
import numpy as np
import torch
from torch import nn
import torch.nn as nn
import torch.nn.functional as F
from pytorch_utils import *
from plot_lib import *
import os
"""## 1. Reading Data Files"""
"""
def readData(dataFilename) :
allX = []
allY = []
......@@ -37,6 +44,7 @@ def readData(dataFilename) :
x_train = np.array(allX)
y_train = np.array(allY)
return (inputSize, outputSize, x_train, y_train)
"""
......@@ -46,8 +54,437 @@ if len(sys.argv) < 3 :
cffTrainFileName = sys.argv[1]
cffDevFileName = sys.argv[2]
kerasModelFileName = sys.argv[3]
pytorchModelFileName = sys.argv[3]
batch_size = 32
n_classes, maxlen, n_symbols, symbol_to_idx, idx_to_symbol, \
class_to_idx, idx_to_class, classes = make_pytorch_dicts(cffTrainFileName, cffDevFileName)
train_items_list, train_labels_list, train_inputSize, train_outputSize = readFile_cff(cffTrainFileName)
dev_items_list, dev_labels_list, dev_inputSize, dev_outputSize = readFile_cff(cffDevFileName)
train_data_gen = preprocess_data(train_items_list[:800000], train_labels_list[:800000], batch_size, symbol_to_idx, class_to_idx, train_inputSize, train_outputSize)#, n_symbols, n_classes)
dev_data_gen = preprocess_data(dev_items_list[:200000], dev_labels_list[:200000], batch_size, symbol_to_idx, class_to_idx, train_inputSize, train_outputSize)#, n_symbols, n_classes)
"""## 2. Defining the Model"""
# Set the random seed for reproducible results
torch.manual_seed(1)
# Setup the RNN and training settings
input_size = 133 #train_inputSize #n_symbols
hidden_size = 128
output_size = 75 #train_outputSize #n_classes
class SimpleMLP(nn.Module):
def __init__(self, input_size, output_size):
super(SimpleMLP, self).__init__()
self.fc1 = nn.Linear(input_size, 200) # Dense layer with 200 units
self.dropout = nn.Dropout(0.5) # Dropout with 0.5 probability
self.fc2 = nn.Linear(200, output_size) # Dense layer with outputSize units
def forward(self, x):
x = F.relu(self.fc1(x)) # Apply ReLU activation
x = self.dropout(x) # Apply Dropout
x = F.softmax(self.fc2(x), dim=1) # Apply Softmax activation
return x
def get_states_across_time(self, x):
h_c = None
h_list, c_list = list(), list()
with torch.no_grad():
for t in range(x.size(1)):
h_c = self.lstm(x[:, [t], :], h_c)[1]
h_list.append(h_c[0])
c_list.append(h_c[1])
h = torch.cat(h_list)
c = torch.cat(c_list)
return h, c
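# Note: get_states_across_time above references self.lstm, which SimpleMLP never defines; it
# appears to be copied from SimpleLSTM below and would raise AttributeError if called on an MLP.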
class SimpleRNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
# This just calls the base class constructor
super().__init__()
# Neural network layers assigned as attributes of a Module subclass
# have their parameters registered for training automatically.
self.rnn = torch.nn.RNN(input_size, hidden_size, nonlinearity='relu', batch_first=True)
self.linear = torch.nn.Linear(hidden_size, output_size)
def forward(self, x):
# The RNN also returns its hidden state but we don't use it.
# While the RNN can also take a hidden state as input, the RNN
# gets passed a hidden state initialized with zeros by default.
h = self.rnn(x)[0]
x = self.linear(h)
return x
class SimpleLSTM(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.lstm = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = torch.nn.Linear(hidden_size, output_size)
def forward(self, x):
h = self.lstm(x)[0]
x = self.linear(h)
return x
def get_states_across_time(self, x):
h_c = None
h_list, c_list = list(), list()
with torch.no_grad():
for t in range(x.size(1)):
h_c = self.lstm(x[:, [t], :], h_c)[1]
h_list.append(h_c[0])
c_list.append(h_c[1])
h = torch.cat(h_list)
c = torch.cat(c_list)
return h, c
"""## 3. Defining the Training Loop"""
def train(model, train_data_gen, criterion, optimizer, device):
# Set the model to training mode. This will turn on layers that would
# otherwise behave differently during evaluation, such as dropout.
model.train()
# Store the number of sequences that were classified correctly
num_correct = 0
# Iterate over every batch of sequences. Note that the length of a data generator
# is defined as the number of batches required to produce a total of roughly 1000
# sequences given a batch size.
for batch_idx in range(len(train_data_gen)):
# Request a batch of sequences and class labels, convert them into tensors
# of the correct type, and then send them to the appropriate device.
data, target = train_data_gen[batch_idx]
data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)
# Perform the forward pass of the model
output = model(data) # Step ①
# Pick only the output corresponding to last sequence element (input is pre padded)
output = output[:, -1, :]
# Compute the value of the loss for this batch. For loss functions like CrossEntropyLoss,
# the second argument is actually expected to be a tensor of class indices rather than
# one-hot encoded class labels. One approach is to take advantage of the one-hot encoding
# of the target and call argmax along its second dimension to create a tensor of shape
# (batch_size) containing the index of the class label that was hot for each sequence.
target = target.argmax(dim=1)
loss = criterion(output, target) # Step ②
# Clear the gradient buffers of the optimized parameters.
# Otherwise, gradients from the previous batch would be accumulated.
optimizer.zero_grad() # Step ③
loss.backward() # Step ④
optimizer.step() # Step ⑤
y_pred = output.argmax(dim=1)
num_correct += (y_pred == target).sum().item()
return num_correct, loss.item()
"""## 4. Defining the Testing Loop"""
def test(model, test_data_gen, criterion, device):
# Set the model to evaluation mode. This will turn off layers that would
# otherwise behave differently during training, such as dropout.
model.eval()
# Store the number of sequences that were classified correctly
num_correct = 0
# A context manager is used to disable gradient calculations during inference
# to reduce memory usage, as we typically don't need the gradients at this point.
with torch.no_grad():
for batch_idx in range(len(test_data_gen)):
data, target = test_data_gen[batch_idx]
data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)
output = model(data)
# Pick only the output corresponding to last sequence element (input is pre padded)
output = output[:, -1, :]
target = target.argmax(dim=1)
loss = criterion(output, target)
y_pred = output.argmax(dim=1)
num_correct += (y_pred == target).sum().item()
return num_correct, loss.item()
"""## 5. Putting it All Together"""
import matplotlib.pyplot as plt
from plot_lib import set_default, plot_state, print_colourbar
set_default()
def check_point(model, filename, epochs, optimizer, criterion):
#torch.save(model.state_dict(), filename)
print(f"Saving model from epoch", epochs)
torch.save({
'epoch': epochs,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': criterion,
}, filename)
def resume(model, filename):
#model.load_state_dict(torch.load(filename))
checkpoint = torch.load(filename, map_location=torch.device('cpu'), weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
def train_and_test(model, train_data_gen, test_data_gen, criterion, optimizer, max_epochs, verbose=True):
# Automatically determine the device that PyTorch should use for computation
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Move model to the device which will be used for train and test
model.to(device)
# Track the value of the loss function and model accuracy across epochs
history_train = {'loss': [], 'acc': []}
history_test = {'loss': [], 'acc': []}
early_stop_thresh = 5
best_accuracy = -1
best_epoch = -1
for epoch in range(max_epochs):
# Run the training loop and calculate the accuracy.
# Remember that the length of a data generator is the number of batches,
# so we multiply it by the batch size to recover the total number of sequences.
num_correct, loss = train(model, train_data_gen, criterion, optimizer, device)
accuracy = float(num_correct) / (len(train_data_gen) * batch_size) * 100
history_train['loss'].append(loss)
history_train['acc'].append(accuracy)
# Do the same for the testing loop
num_correct, loss = test(model, test_data_gen, criterion, device)
accuracy = float(num_correct) / (len(test_data_gen) * batch_size) * 100
history_test['loss'].append(loss)
history_test['acc'].append(accuracy)
if history_test['acc'][-1] > best_accuracy:
best_accuracy = history_test['acc'][-1]
best_epoch = epoch
print('epoch', epoch)
check_point(model, pytorchModelFileName, epoch, optimizer, criterion)
elif epoch - best_epoch > early_stop_thresh:
print("Early stopped training at epoch %d" % epoch)
break # terminate the training loop
if verbose or epoch + 1 == max_epochs:
print(f'[Epoch {epoch + 1}/{max_epochs}]'
f" loss: {history_train['loss'][-1]:.4f}, acc: {history_train['acc'][-1]:2.2f}%"
f" - test_loss: {history_test['loss'][-1]:.4f}, test_acc: {history_test['acc'][-1]:2.2f}%")
resume(model, pytorchModelFileName)
# Generate diagnostic plots for the loss and accuracy
fig, axes = plt.subplots(ncols=2, figsize=(9, 4.5))
for ax, metric in zip(axes, ['loss', 'acc']):
ax.plot(history_train[metric])
ax.plot(history_test[metric])
ax.set_xlabel('epoch', fontsize=12)
ax.set_ylabel(metric, fontsize=12)
ax.legend(['Train', 'Test'], loc='best')
plt.savefig(os.path.abspath('..') + '/expe/out/loss_accuracy.png')
#plt.show()
return model
"""## 5. Simple RNN: 10 Epochs"""
"""
# Setup the training and test data generators
# Setup the RNN and training settings
input_size = n_symbols
hidden_size = 128
output_size = n_classes
model = SimpleRNN(input_size, hidden_size, output_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
max_epochs = 100
# Train the model
model = train_and_test(model, train_data_gen, dev_data_gen, criterion, optimizer, max_epochs)
for parameter_group in list(model.parameters()):
print(parameter_group.size())
"""
"""## 6b SimpleMLP: 10 Epochs
"""
"""
# Setup the MLP and training settings
input_size = n_symbols
output_size = n_classes
model = SimpleMLP(input_size, output_size)
criterion = torch.nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
optimizer = torch.optim.Adamax(model.parameters(), lr=0.01)
max_epochs = 30
# Train the model
model = train_and_test(model, train_data_gen, dev_data_gen, criterion, optimizer, max_epochs)
for parameter_group in list(model.parameters()):
print(parameter_group.size())
"""
"""## 6. Simple LSTM: 10 Epochs"""
model = SimpleLSTM(input_size, hidden_size, output_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
max_epochs = 30
# Train the model
model = train_and_test(model, train_data_gen, dev_data_gen, criterion, optimizer, max_epochs)
#for parameter_group in list(model.parameters()):
# print(parameter_group.size())
"""## 7. Model Evaluation"""
import collections
import random
def evaluate_model(model, seed=9001, verbose=False):
# Define a dictionary that maps class indices to labels
#class_idx_to_label = {0: 'Q', 1: 'R', 2: 'S', 3: 'U', 4: '0', 5: '8'}
class_idx_to_label = {n: c for n, c in enumerate(classes)}
# Create a new data generator
#data_generator = QRSU.prepare_data(seed=seed)
data_generator = preprocess_data(dev_items_list, dev_labels_list, batch_size, symbol_to_idx, class_to_idx, input_size, output_size)#, n_symbols, n_classes)
# Track the number of times a class appears
count_classes = collections.Counter()
# Keep correctly classified and misclassified sequences, and their
# true and predicted class labels, for diagnostic information.
correct = []
incorrect = []
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.eval()
with torch.no_grad():
for batch_idx in range(len(data_generator)):
data, target = data_generator[batch_idx]
data, target = torch.from_numpy(data).float().to(device), torch.from_numpy(target).long().to(device)
data_decoded = decode_x_batch(data.cpu().numpy(), idx_to_symbol)
target_decoded = decode_y_batch(target.cpu().numpy(), idx_to_class)
output = model(data)
output = output[:, -1, :]
target = target.argmax(dim=1)
y_pred = output.argmax(dim=1)
y_pred_decoded = [class_idx_to_label[y.item()] for y in y_pred]
count_classes.update(target_decoded)
for i, (truth, prediction) in enumerate(zip(target_decoded, y_pred_decoded)):
if truth == prediction:
correct.append((data_decoded[i], truth, prediction))
else:
incorrect.append((data_decoded[i], truth, prediction))
num_sequences = sum(count_classes.values())
print('len correct', len(correct))
print('num seqs', num_sequences)
accuracy = float(len(correct)) / num_sequences * 100
print(f'The accuracy of the model is measured to be {accuracy:.2f}%.\n')
# Report the accuracy by class
for label in sorted(count_classes):
num_correct = sum(1 for _, truth, _ in correct if truth == label)
#print(f'{label}: {num_correct} / {count_classes[label]} correct')
print("Number of correct predictions: ", len(correct))
# Report some random sequences for examination
print('\nHere are some example sequences:')
if len(correct) > 0:
for i in range(0,3):
sequence, truth, prediction = correct[random.randrange(0, len(correct))]  # sample anywhere in the list (was hard-coded randrange(0, 2))
#print(f'{sequence} -> {truth} was labelled {prediction}')
print("Number of incorrect predictions: ", len(incorrect))
# Report misclassified sequences for investigation
if len(incorrect) > 0:
print('\nThe following sequences were misclassified:')
for i in range(0,3):
sequence, truth, prediction = incorrect[random.randrange(0, len(incorrect))]  # avoids an IndexError when fewer than two sequences were misclassified
print(f'{sequence} -> {truth} was labelled {prediction}')
else:
print('\nThere were no misclassified sequences.')
evaluate_model(model)
""" Visualize Model """
# Get hidden (H) and cell (C) batch state given a batch input (X)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.eval()
with torch.no_grad():
data = dev_data_gen[0][0]
X = torch.from_numpy(data).float().to(device)
H_t, C_t = model.get_states_across_time(X)
#print("Color range is as follows:")
#print_colourbar()
#plot_state(X.cpu(), C_t, b=31, decoder=decode_x, idx_to_symbol=idx_to_symbol) # 3, 6, 9
#plot_state(X.cpu(), H_t, b=31, decoder=decode_x, idx_to_symbol=idx_to_symbol) #b=9
"""
inputSize, outputSize, x_train, y_train = readData(cffTrainFileName)
devInputSize, devOutputSize, x_dev, y_dev = readData(cffDevFileName)
model = mlp()
......@@ -75,4 +512,4 @@ model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_dev,y_d
model.save(kerasModelFileName)
"""