Commit f47c31e2 authored by Tania Bladier

add bib for tokenization literature

parent cde5c2f5
@@ -1000,6 +1000,30 @@
groups = {Groupe de lecture},
}
@InProceedings{vilar2021statistical,
author = {Vilar, David and Federico, Marcello},
booktitle = {Proceedings of the 18th International Conference on Spoken Language Translation (IWSLT 2021)},
title = {A statistical extension of byte-pair encoding},
year = {2021},
pages = {263--275},
file = {:2021.iwslt-1.31.pdf:PDF},
groups = {Tokenization},
printed = {printed},
}
@Article{rey2024chomsky,
author = {Rey, Arnaud},
journal = {L’Ann{\'e}e psychologique},
title = {Chomsky’s error on language and implicit statistical learning: Introduction to the special issue},
year = {2024},
number = {3},
pages = {277--282},
volume = {124},
file = {:1.Introduction_Special_Issue.pdf:PDF},
groups = {Groupe de lecture},
publisher = {Presses Universitaires de France},
}
@Comment{jabref-meta: databaseType:bibtex;}
@Comment{jabref-meta: grouping:
...
@Article{toraman2023impact,
author = {Toraman, Cagri and Yilmaz, Eyup Halit and {\c{S}}ahinu{\c{c}}, Furkan and Ozcelik, Oguzhan},
journal = {ACM Transactions on Asian and Low-Resource Language Information Processing},
title = {Impact of tokenization on language models: An analysis for Turkish},
year = {2023},
number = {4},
pages = {1--21},
volume = {22},
file = {:3578707.pdf:PDF},
groups = {Tokenization},
publisher = {ACM New York, NY},
}
@Article{edman2024cute,
author = {Edman, Lukas and Schmid, Helmut and Fraser, Alexander},
journal = {arXiv preprint arXiv:2409.15452},
title = {CUTE: Measuring LLMs' Understanding of Their Tokens},
year = {2024},
file = {:2409.15452v1.pdf:PDF},
groups = {Tokenization},
}
@InProceedings{weller-di-marco-fraser-2024-analyzing,
author = {Weller-Di Marco, Marion and Fraser, Alexander},
booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
title = {Analyzing the Understanding of Morphologically Complex Words in Large Language Models},
year = {2024},
address = {Torino, Italia},
editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
month = may,
pages = {1009--1020},
publisher = {ELRA and ICCL},
abstract = {We empirically study the ability of a Large Language Model (gpt-3.5-turbo-instruct) to understand morphologically complex words. In our experiments, we looked at a variety of tasks to analyse German compounds with regard to compositional word formation and derivation, such as identifying the head noun of existing and novel compounds, identifying the shared verb stem between two words, or recognizing words constructed with inappropriately used derivation morphemes as invalid. Our results show that the language model is generally capable of solving most tasks, except for the task of identifying ill-formed word forms. While the model demonstrated a good overall understanding of complex words and their word-internal structure, the results also suggest that there is no formal knowledge of derivational rules, but rather an interpretation of the observed word parts to derive the meaning of a word.},
file = {:2024.lrec-main.90.pdf:PDF},
groups = {Tokenization},
url = {https://aclanthology.org/2024.lrec-main.90},
}
@Article{alrefaie2024exploring,
author = {Alrefaie, Mohamed Taher and Morsy, Nour Eldin and Samir, Nada},
journal = {arXiv preprint arXiv:2403.11130},
title = {Exploring Tokenization Strategies and Vocabulary Sizes for Enhanced Arabic Language Models},
year = {2024},
file = {:2403.11130v2.pdf:PDF},
groups = {Tokenization},
}
@Article{Luitel2024CanPP,
author = {Luitel, Nishant and Bekoju, Nirajan and Sah, Anand Kumar and Shakya, Subarna},
journal = {arXiv preprint arXiv:2404.18071},
title = {Can Perplexity Predict Fine-Tuning Performance? An Investigation of Tokenization Effects on Sequential Language Models for Nepali},
year = {2024},
file = {:2404.18071v1.pdf:PDF},
groups = {Tokenization},
url = {https://api.semanticscholar.org/CorpusID:269449398},
}
@Article{alyafeai2023evaluating,
author = {Alyafeai, Zaid and Al-shaibani, Maged S and Ghaleb, Mustafa and Ahmad, Irfan},
journal = {Neural Processing Letters},
title = {Evaluating various tokenizers for Arabic text classification},
year = {2023},
number = {3},
pages = {2911--2933},
volume = {55},
file = {:s11063-022-10990-8.pdf:PDF},
groups = {Tokenization},
publisher = {Springer},
}
@Article{lindsey2024comparison,
author = {Lindsey, LeAnn M and Pershing, Nicole L and Habib, Anisa and Stephens, W Zac and Blaschke, Anne J and Sundar, Hari},
journal = {bioRxiv},
title = {A Comparison of Tokenization Impact in Attention Based and State Space Genomic Language Models},
year = {2024},
pages = {2024--09},
file = {:2024.09.09.612081v1.full.pdf:PDF},
groups = {Tokenization},
publisher = {Cold Spring Harbor Laboratory},
}
@InProceedings{benamar2022evaluating,
author = {Benamar, Alexandra and Grouin, Cyril and Bothua, Meryl and Vilnat, Anne},
booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
title = {Evaluating tokenizers impact on OOVs representation with transformers models},
year = {2022},
pages = {4193--4204},
file = {:2022.lrec-1.445.pdf:PDF},
groups = {Tokenization},
}
@Article{Dotan_Effect_of_Tokenization_2024,
author = {Dotan, Edo and Jaschek, Gal and Pupko, Tal and Belinkov, Yonatan},
journal = {Bioinformatics},
title = {{Effect of tokenization on transformers for biological sequences}},
year = {2024},
issn = {1367-4811},
month = {04},
pages = {btae196},
abstract = {{Deep-learning models are transforming biological research, including many bioinformatics and comparative genomics algorithms, such as sequence alignments, phylogenetic tree inference, and automatic classification of protein functions. Among these deep-learning algorithms, models for processing natural languages, developed in the natural language processing (NLP) community, were recently applied to biological sequences. However, biological sequences are different from natural languages, such as English, and French, in which segmentation of the text to separate words is relatively straightforward. Moreover, biological sequences are characterized by extremely long sentences, which hamper their processing by current machine-learning models, notably the transformer architecture. In NLP, one of the first processing steps is to transform the raw text to a list of tokens. Deep-learning applications to biological sequence data mostly segment proteins and DNA to single characters. In this work, we study the effect of alternative tokenization algorithms on eight different tasks in biology, from predicting the function of proteins and their stability, through nucleotide sequence alignment, to classifying proteins to specific families. We demonstrate that applying alternative tokenization algorithms can increase accuracy and at the same time, substantially reduce the input length compared to the trivial tokenizer in which each character is a token. Furthermore, applying these tokenization algorithms allows interpreting trained models, taking into account dependencies among positions. Finally, we trained these tokenizers on a large dataset of protein sequences containing more than 400 billion amino acids, which resulted in over a three-fold decrease in the number of tokens. We then tested these tokenizers trained on large-scale data on the above specific tasks and showed that for some tasks it is highly beneficial to train database-specific tokenizers. Our study suggests that tokenizers are likely to be a critical component in future deep-network analysis of biological sequence data. Code, data and trained tokenizers are available on https://github.com/technion-cs-nlp/BiologicalTokenizers.}},
doi = {10.1093/bioinformatics/btae196},
eprint = {https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btae196/57226869/btae196.pdf},
file = {:btae196.pdf:PDF},
groups = {Tokenization},
url = {https://doi.org/10.1093/bioinformatics/btae196},
}
@InProceedings{kamps2020impact,
author = {Kamps, Jaap and Kondylidis, Nikolaos and Rau, David and others},
booktitle = {TREC},
title = {Impact of Tokenization, Pretraining Task, and Transformer Depth on Text Ranking},
year = {2020},
file = {:UAmsterdam.DL.pdf:PDF},
groups = {Tokenization},
}
@Article{xu2020vocabulary,
author = {Xu, Jingjing and Zhou, Hao and Gan, Chun and Zheng, Zaixiang and Li, Lei},
journal = {arXiv preprint arXiv:2012.15671},
title = {Vocabulary learning via optimal transport for neural machine translation},
year = {2020},
file = {:2021.acl-long.571.pdf:PDF},
groups = {Tokenization},
}
@Article{ahia2023all,
author = {Ahia, Orevaoghene and Kumar, Sachin and Gonen, Hila and Kasai, Jungo and Mortensen, David R and Smith, Noah A and Tsvetkov, Yulia},
journal = {arXiv preprint arXiv:2305.13707},
title = {Do all languages cost the same? Tokenization in the era of commercial language models},
year = {2023},
file = {:2023.emnlp-main.614.pdf:PDF},
groups = {Tokenization},
}
@Article{mielke2021between,
author = {Mielke, Sabrina J and Alyafeai, Zaid and Salesky, Elizabeth and Raffel, Colin and Dey, Manan and Gall{\'e}, Matthias and Raja, Arun and Si, Chenglei and Lee, Wilson Y and Sagot, Beno{\^\i}t and others},
journal = {arXiv preprint arXiv:2112.10508},
title = {Between words and characters: A brief history of open-vocabulary modeling and tokenization in NLP},
year = {2021},
file = {:2112.10508v1.pdf:PDF},
groups = {Tokenization},
}