Commit bec285c0 authored by Benoit Favre

first set of scripts for using llama LM

parent 3f6d6764
@@ -6,7 +6,11 @@ Install:
pip install -r requirements.txt
```
Installing bitsandbytes for llama models is a bit more [involved](https://gitlab.lis-lab.fr/cluster/wiki/-/wikis/Compiling%20bitsandbytes%20for%20int8%20inference).
See RESULTS for the exact match results on the dev set.
See runs.sh for how to generate runs.
Note that external APIs require API keys. Please rename api_keys.template.py to api_keys.py and set the keys you need inside.
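The exact contents of the template are not shown in this diff; as a minimal sketch (variable names below are assumptions, not the actual template), api_keys.py would simply hold one module-level string per provider:
```python
# api_keys.py -- hypothetical layout, names are placeholders
openai_api_key = "..."  # fill in only the keys for the APIs you actually use
```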
@@ -14,12 +14,20 @@ lm_templates_en = [
letters = 'abcdefghijklmnopqrstuvwxyz'
def linearize_instance(instance, include_correct_answers=False):
def linearize_instance(instance, include_correct_answers=False, add_left_parenthesis=False):
result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
if include_correct_answers:
result += '\nRéponse(s) : ' + ' '.join('(%s)' % a for a in instance['correct_answers'])
else:
result += '\nRéponse(s) :' + (' (' if add_left_parenthesis else '')
return result
#def linearize_instance(instance, include_correct_answers=False):
# result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
# if include_correct_answers:
# result += '\nRéponse(s) : ' + ' '.join('(%s)' % a for a in instance['correct_answers'])
# return result
def get_prompt(prompt, instance, few_shots=[]):
shots = [linearize_instance(shot, include_correct_answers=True) for shot in few_shots]
return prompt % ('\n\n'.join(shots + [linearize_instance(instance)]),)
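# Example (hypothetical instance, using the fields expected above):
#   linearize_instance({'question': 'Quelle est la bonne réponse ?',
#                       'answers': {'a': 'Choix 1', 'b': 'Choix 2'},
#                       'correct_answers': ['b']}, include_correct_answers=True)
# produces:
#   Quelle est la bonne réponse ?
#   (a) Choix 1.
#   (b) Choix 2.
#   Réponse(s) : (b)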
@@ -33,10 +41,17 @@ def extract_answer(answer, num_answers=5):
selected = [x.replace(')', '').replace('(', '') for x in selected]
return list(sorted(set([letter.lower() for letter in selected])))
def hamming(a, b, num):
A = [c.upper() if c in a else c for c in letters[:num]]
B = [c.upper() if c in b else c for c in letters[:num]]
return [x == y for x, y in zip(A, B)].count(True)
#def hamming(a, b, num):
# A = [c.upper() if c in a else c for c in letters[:num]]
# B = [c.upper() if c in b else c for c in letters[:num]]
# return [x == y for x, y in zip(A, B)].count(True)
def hamming(preds, refs):
corrects = [True for p in preds if p in refs]
corrects = sum(corrects)
total_refs = len(list(set(preds + refs)))
return corrects / total_refs
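# Worked example: hamming(['a', 'c'], ['a', 'b']) counts 1 correct prediction ('a')
# over the 3 letters in the union {'a', 'b', 'c'}, i.e. 1/3 -- a Jaccard-style overlap
# rather than the per-position count of the commented-out version above.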
def run_inference(generator, corpus_path, template):
with open(corpus_path) as fp:
@@ -56,7 +71,7 @@ def run_inference(generator, corpus_path, template):
print(answer, instance['correct_answers'])
if set(answer) == set(instance['correct_answers']):
num_exact_correct += 1
num_hamming_correct += hamming(answer, instance['correct_answers'], len(instance['answers']))
num_hamming_correct += hamming(answer, instance['correct_answers'])
num_hamming += len(instance['answers'])
results.append(instance['id'] + ';' + '|'.join(list(sorted(answer))))
# adapted from https://github.com/tloen/alpaca-lora/
import os
import uuid
import json
@@ -7,120 +8,126 @@ import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
LLAMA_VARIANT = 65
from deft import linearize_instance
# reasonable batch size with 80GB GPU
# 7B -> 24
# 13B -> 12
# 30B -> 6
# 65B -> 1
MICRO_BATCH_SIZE = 1
BATCH_SIZE = 24
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
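# e.g. BATCH_SIZE = 24 with MICRO_BATCH_SIZE = 1 gives 24 gradient-accumulation steps
# per optimizer update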
EPOCHS = 1 # from the result
LEARNING_RATE = 3e-4 # the karpathy constant
CUTOFF_LEN = 256
WARMUP_RATIO = 0.05
IS_INT8 = True
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
PROMPT = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n%s'''
def main(LLAMA_VARIANT : int = 65,
MICRO_BATCH_SIZE : int = 1,
BATCH_SIZE : int = 24,
EPOCHS : int = 1,
LEARNING_RATE : float = 3e-4,
CUTOFF_LEN : int = 256,
WARMUP_RATIO : float = 0.05,
IS_INT8 : bool = True,
LORA_R : int = 4,
LORA_ALPHA : int = 16,
         LORA_DROPOUT : float = 0.05,
valid_steps : int = 20,
PROMPT : str = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n%s''',
output_dir : str = 'deft_models',
train_json : str = '../../json/train.json',
dev_json : str = '../../json/dev.json',
):
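    # With fire.Fire(main) at the bottom of this script, every parameter above becomes a
    # command-line flag, e.g. (hypothetical invocation):
    #   python <this_script>.py --LLAMA_VARIANT=7 --EPOCHS=2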
#PROMPT = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n## Question 1\n\n%s'''
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
BACKBONE = "decapoda-research/llama-%db-hf" % LLAMA_VARIANT
OUTPUT_PATH = "%s/deft_%s_lora_%s" % (output_dir, BACKBONE.split('/')[-1], str(uuid.uuid4()))
print(OUTPUT_PATH)
os.makedirs(OUTPUT_PATH, exist_ok=True)
model = LlamaForCausalLM.from_pretrained(
BACKBONE,
load_in_8bit=IS_INT8,
device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(BACKBONE, add_eos_token=True)
if IS_INT8:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
target_modules=["q_proj", "v_proj"],
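        # q_proj / v_proj are the attention query and value projections in LLaMA;
        # LoRA adapters are injected only into those matrices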
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
data = load_dataset("json", data_files="../../json/train.json")
data_valid = load_dataset("json", data_files="../../json/dev.json")
data['validation'] = data_valid['train']
def generate_prompt(data_point):
return PROMPT % linearize_instance(data_point, include_correct_answers=True)
print(generate_prompt(data['train'][0]))
data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
)
)
BACKBONE = "decapoda-research/llama-%db-hf" % LLAMA_VARIANT
OUTPUT_PATH = "deft_models/deft_%s_lora_%s" % (BACKBONE.split('/')[-1], str(uuid.uuid4()))
print(OUTPUT_PATH)
os.makedirs(OUTPUT_PATH, exist_ok=True)
model = LlamaForCausalLM.from_pretrained(
BACKBONE,
load_in_8bit=IS_INT8,
device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(BACKBONE, add_eos_token=True)
if IS_INT8:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
target_modules=["q_proj", "v_proj"],
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
data = load_dataset("json", data_files="../../json/train.json")
data_valid = load_dataset("json", data_files="../../json/dev.json")
data['validation'] = data_valid['train']
def linearize_instance(instance, include_correct_answers=False):
result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
if include_correct_answers:
result += '\nRéponse(s) : ' + '; '.join('(%s) %s' % (a, instance['answers'][a]) for a in instance['correct_answers']) + '.\n'
else:
result += '\nRéponse(s) : ('
return result
def generate_prompt(data_point):
return PROMPT % linearize_instance(data_point, include_correct_answers=True)
print(generate_prompt(data['train'][0]))
data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
eval_dataset=data["validation"],
args=transformers.TrainingArguments(
do_eval=True,
evaluation_strategy='steps',
eval_steps=valid_steps,
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_ratio=WARMUP_RATIO,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir=OUTPUT_PATH,
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
)
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
eval_dataset=data["validation"],
args=transformers.TrainingArguments(
do_eval=True,
evaluation_strategy='steps',
eval_steps=20,
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_ratio=WARMUP_RATIO,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir=OUTPUT_PATH,
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_PATH)
with open("%s/training_config.json" % OUTPUT_PATH, 'w') as fp:
fp.write(json.dumps({
'backbone': BACKBONE,
'micro_batch_size': MICRO_BATCH_SIZE,
'batch_size': BATCH_SIZE,
'epochs': EPOCHS,
'learning_rate': LEARNING_RATE,
'cutoff_len': CUTOFF_LEN,
'warmup_ratio': WARMUP_RATIO,
'llama_variant': LLAMA_VARIANT,
'is_int8': IS_INT8,
'lora_r': LORA_R,
'lora_alpha': LORA_ALPHA,
'lora_dropout': LORA_DROPOUT,
'prompt': PROMPT,
'output_path': OUTPUT_PATH
}, indent=4))
print(OUTPUT_PATH)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_PATH)
with open("%s/training_config.json" % OUTPUT_PATH, 'w') as fp:
fp.write(json.dumps({
'backbone': BACKBONE,
'micro_batch_size': MICRO_BATCH_SIZE,
'batch_size': BATCH_SIZE,
'epochs': EPOCHS,
'learning_rate': LEARNING_RATE,
'cutoff_len': CUTOFF_LEN,
'warmup_ratio': WARMUP_RATIO,
'llama_variant': LLAMA_VARIANT,
'is_int8': IS_INT8,
'lora_r': LORA_R,
'lora_alpha': LORA_ALPHA,
'lora_dropout': LORA_DROPOUT,
'prompt': PROMPT,
'output_path': OUTPUT_PATH
}, indent=4))
print(OUTPUT_PATH)
if __name__ == '__main__':
import fire
fire.Fire(main)
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
import transformers
import torch
import sys
import json
from argparse import Namespace
from deft import linearize_instance, extract_answer, hamming
# stop inference when a given token was generated (for example '\n')
class TokenStopper(transformers.StoppingCriteria):
    def __init__(self, token, prompt_lengths):
        self.token = tokenizer.encode(token)[-1]  # id of the stop token (uses the global tokenizer)
        self.prompt_lengths = prompt_lengths
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # stop only once every sequence in the batch has produced the token after its prompt
        for sequence, length in zip(input_ids, self.prompt_lengths):
            sequence = sequence[length:]
            if (sequence == self.token).sum() == 0:
                return False
        return True
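# Intended use (currently commented out in run_batch below): pass
# [TokenStopper('\n', prompt_lengths)] as stopping_criteria to model.generate so that
# decoding halts once every sequence has emitted a newline after its prompt.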
def run_batch(batch, eval_stats, output_fp):
prompts = [config.prompt % linearize_instance(instance, add_left_parenthesis=True) for instance in batch]
inputs = tokenizer(
prompts,
return_tensors="pt",
padding=True,
).to('cuda')
lengths = [len(tokenizer.encode(prompt)) for prompt in prompts]
generation_config = GenerationConfig(
temperature=0.1,
top_p=0.75,
top_k=40,
num_beams=4,
)
with torch.no_grad():
generation_output = model.generate(
**inputs, max_new_tokens=128,
generation_config=generation_config,
#stopping_criteria=[TokenStopper('\n', [len(sequence) for sequence in inputs['input_ids']])]
)
for instance, PROMPT, length, sequence in zip(batch, prompts, lengths, generation_output):
generated = tokenizer.decode(sequence, skip_special_tokens=True)
print(generated)
answer = extract_answer(generated[len(PROMPT):].split('\n')[0])
print(answer, instance['correct_answers'])
output_fp.write(instance['id'] + ';' + '|'.join(sorted(answer)).lower() + '\n')
eval_stats['num_emr'] += 1
if set(answer) == set(instance['correct_answers']):
eval_stats['num_emr_correct'] += 1
eval_stats['num_hamming_sum'] += hamming(answer, instance['correct_answers'])
eval_stats['num_hamming'] += len(instance['answers'])
def main(model_path : str, output_path : str, corpus_path : str = '../../json/dev.json', adapted : bool = True, batch_size : int = 1):
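    # With fire.Fire(main) below, e.g. (hypothetical paths):
    #   python <this_script>.py --model_path=deft_models/deft_llama-65b-hf_lora_<uuid> --output_path=dev_answers.txt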
global config
import json
with open('%s/training_config.json' % model_path) as fp:
config = Namespace(**json.loads(fp.read()))
#config.prompt = "This is a MCQ from the biology exam in French. Answer with the correct set of letters.\n\n%s"
global tokenizer
tokenizer = LlamaTokenizer.from_pretrained(config.backbone, padding_side='left')
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
global model
model = LlamaForCausalLM.from_pretrained(
config.backbone,
load_in_8bit=config.is_int8,
torch_dtype=torch.float16,
device_map="auto",
)
if adapted:
# device_map={"": 0} fix device = None error https://github.com/huggingface/peft/issues/115
model = PeftModel.from_pretrained(model, model_path, torch_dtype=torch.float16, device_map={"": 0})
with open(corpus_path) as fp:
corpus = json.loads(fp.read())
eval_stats = {
'num_emr': 0,
'num_emr_correct': 0,
'num_hamming_sum': 0,
'num_hamming': 0,
}
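    # num_emr counts scored instances and num_emr_correct the exact matches;
    # num_hamming_sum accumulates the per-instance hamming() scores and num_hamming the
    # total number of candidate answers (these feed the EXACT MATCH and HAMMING DIST prints below).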
with open(output_path, 'w') as output_fp:
for i in range(0, len(corpus), batch_size):
run_batch(corpus[i: i + batch_size], eval_stats, output_fp)
print('EXACT MATCH:', config.output_path, eval_stats['num_emr_correct'] / eval_stats['num_emr'])
print('HAMMING DIST:', config.output_path, eval_stats['num_hamming_sum'] / eval_stats['num_hamming'])
if __name__ == '__main__':
import fire
fire.Fire(main)