Commit e4a846d2 authored by Tania Bladier

update tok literature

parent 334570b6
@@ -58,6 +58,7 @@
   title = {{Attention is all you need}},
   year = {2017},
   volume = {30},
+  file = {:1706.03762v7.pdf:PDF},
   groups = {ml-architechtures},
 }
@@ -880,6 +881,50 @@
   publisher = {Cold Spring Harbor Laboratory},
 }
+@Article{brown2020language,
+  author = {Brown, Tom B. and others},
+  journal = {arXiv preprint arXiv:2005.14165},
+  title = {Language models are few-shot learners},
+  year = {2020},
+  file = {:2005.14165v4.pdf:PDF},
+  groups = {ml-architechtures, read-asap},
+}
+@InProceedings{benamar2022evaluating,
+  author = {Benamar, Alexandra and Grouin, Cyril and Bothua, Meryl and Vilnat, Anne},
+  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
+  title = {Evaluating tokenizers impact on {OOVs} representation with transformers models},
+  year = {2022},
+  pages = {4193--4204},
+  file = {:2022.lrec-1.445.pdf:PDF},
+  groups = {Tokenization},
+}
+@Article{Dotan_Effect_of_Tokenization_2024,
+  author = {Dotan, Edo and Jaschek, Gal and Pupko, Tal and Belinkov, Yonatan},
+  journal = {Bioinformatics},
+  title = {{Effect of tokenization on transformers for biological sequences}},
+  year = {2024},
+  issn = {1367-4811},
+  month = {04},
+  pages = {btae196},
+  abstract = {{Deep-learning models are transforming biological research, including many bioinformatics and comparative genomics algorithms, such as sequence alignments, phylogenetic tree inference, and automatic classification of protein functions. Among these deep-learning algorithms, models for processing natural languages, developed in the natural language processing (NLP) community, were recently applied to biological sequences. However, biological sequences are different from natural languages, such as English and French, in which segmentation of the text to separate words is relatively straightforward. Moreover, biological sequences are characterized by extremely long sentences, which hamper their processing by current machine-learning models, notably the transformer architecture. In NLP, one of the first processing steps is to transform the raw text to a list of tokens. Deep-learning applications to biological sequence data mostly segment proteins and DNA to single characters. In this work, we study the effect of alternative tokenization algorithms on eight different tasks in biology, from predicting the function of proteins and their stability, through nucleotide sequence alignment, to classifying proteins to specific families. We demonstrate that applying alternative tokenization algorithms can increase accuracy and, at the same time, substantially reduce the input length compared to the trivial tokenizer in which each character is a token. Furthermore, applying these tokenization algorithms allows interpreting trained models, taking into account dependencies among positions. Finally, we trained these tokenizers on a large dataset of protein sequences containing more than 400 billion amino acids, which resulted in over a three-fold decrease in the number of tokens. We then tested these tokenizers trained on large-scale data on the above specific tasks and showed that for some tasks it is highly beneficial to train database-specific tokenizers. Our study suggests that tokenizers are likely to be a critical component in future deep-network analysis of biological sequence data. Code, data and trained tokenizers are available on https://github.com/technion-cs-nlp/BiologicalTokenizers.}},
+  doi = {10.1093/bioinformatics/btae196},
+  eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae196/57226869/btae196.pdf},
+  file = {:btae196.pdf:PDF},
+  groups = {Tokenization},
+  url = {https://doi.org/10.1093/bioinformatics/btae196},
+}
+@InProceedings{kamps2020impact,
+  author = {Kamps, Jaap and Kondylidis, Nikolaos and Rau, David and others},
+  booktitle = {TREC},
+  title = {Impact of Tokenization, Pretraining Task, and Transformer Depth on Text Ranking},
+  year = {2020},
+  file = {:UAmsterdam.DL.pdf:PDF},
+  groups = {Tokenization},
+}
 @Comment{jabref-meta: databaseType:bibtex;}
 @Comment{jabref-meta: grouping:
......