diff --git a/2021.acl-long.571.pdf b/2021.acl-long.571.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9ee4d045226de6175a3e0c250a6f2016e65ff626
Binary files /dev/null and b/2021.acl-long.571.pdf differ
diff --git a/2023.emnlp-main.614.pdf b/2023.emnlp-main.614.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..404ac01533b4f9e7541b15e7a404e5d29513194c
Binary files /dev/null and b/2023.emnlp-main.614.pdf differ
diff --git a/2112.10508v1.pdf b/2112.10508v1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1fe4a1355302b9384ed9851f65c429c8d0bfa272
Binary files /dev/null and b/2112.10508v1.pdf differ
diff --git a/amu-literature.bib b/amu-literature.bib
index 732173267dc808c157dda3c658541fe8ce7761b4..eb46d1445545c3df41cd15d76f92619f592e5d39 100644
--- a/amu-literature.bib
+++ b/amu-literature.bib
@@ -964,6 +964,39 @@
   year = {2023},
 }
 
+@Misc{xu2020vocabulary,
+  author        = {Xu, Jingjing and Zhou, Hao and Gan, Chun and Zheng, Zaixiang and Li, Lei},
+  title         = {Vocabulary learning via optimal transport for neural machine translation},
+  year          = {2020},
+  eprint        = {2012.15671},
+  archiveprefix = {arXiv},
+  url           = {https://arxiv.org/abs/2012.15671},
+  file          = {:2021.acl-long.571.pdf:PDF},
+  groups        = {Tokenization},
+}
+
+@Misc{ahia2023all,
+  author        = {Ahia, Orevaoghene and Kumar, Sachin and Gonen, Hila and Kasai, Jungo and Mortensen, David R. and Smith, Noah A. and Tsvetkov, Yulia},
+  title         = {Do all languages cost the same? {Tokenization} in the era of commercial language models},
+  year          = {2023},
+  eprint        = {2305.13707},
+  archiveprefix = {arXiv},
+  url           = {https://arxiv.org/abs/2305.13707},
+  file          = {:2023.emnlp-main.614.pdf:PDF},
+  groups        = {Tokenization},
+}
+
+@Misc{mielke2021between,
+  author        = {Mielke, Sabrina J. and Alyafeai, Zaid and Salesky, Elizabeth and Raffel, Colin and Dey, Manan and Gall{\'e}, Matthias and Raja, Arun and Si, Chenglei and Lee, Wilson Y. and Sagot, Beno{\^\i}t and others},
+  title         = {Between words and characters: A brief history of open-vocabulary modeling and tokenization in {NLP}},
+  year          = {2021},
+  eprint        = {2112.10508},
+  archiveprefix = {arXiv},
+  url           = {https://arxiv.org/abs/2112.10508},
+  file          = {:2112.10508v1.pdf:PDF},
+  groups        = {Tokenization},
+}
+
 @Comment{jabref-meta: databaseType:bibtex;}
 
 @Comment{jabref-meta: grouping: