Commit bec285c0 authored by Benoit Favre

first set of scripts for using llama LM

parent 3f6d6764
@@ -6,7 +6,11 @@ Install:
pip install -r requirements.txt
```
Installing bitsandbytes for llama models is a bit more [involved](https://gitlab.lis-lab.fr/cluster/wiki/-/wikis/Compiling%20bitsandbytes%20for%20int8%20inference).
See RESULTS for the exact match results on the dev set.
See runs.sh for how to generate runs.
Note that external APIs require API keys. Please rename api_keys.template.py to api_keys.py and set the keys you need inside.
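The exact contents of the template are not shown in this diff; as a minimal sketch (variable names below are assumptions, not the actual template), api_keys.py would simply hold one module-level string per provider:
```python
# api_keys.py -- hypothetical layout, names are placeholders
openai_api_key = "..."  # fill in only the keys for the APIs you actually use
```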
@@ -14,12 +14,20 @@ lm_templates_en = [
letters = 'abcdefghijklmnopqrstuvwxyz'
def linearize_instance(instance, include_correct_answers=False):
def linearize_instance(instance, include_correct_answers=False, add_left_parenthesis=False):
result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
if include_correct_answers:
result += '\nRéponse(s) : ' + ' '.join('(%s)' % a for a in instance['correct_answers'])
else:
result += '\nRéponse(s) :' + (' (' if add_left_parenthesis else '')
return result
#def linearize_instance(instance, include_correct_answers=False):
# result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
# if include_correct_answers:
# result += '\nRéponse(s) : ' + ' '.join('(%s)' % a for a in instance['correct_answers'])
# return result
def get_prompt(prompt, instance, few_shots=[]):
shots = [linearize_instance(shot, include_correct_answers=True) for shot in few_shots]
return prompt % ('\n\n'.join(shots + [linearize_instance(instance)]),)
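# Example (hypothetical instance, using the fields expected above):
#   linearize_instance({'question': 'Quelle est la bonne réponse ?',
#                       'answers': {'a': 'Choix 1', 'b': 'Choix 2'},
#                       'correct_answers': ['b']}, include_correct_answers=True)
# produces:
#   Quelle est la bonne réponse ?
#   (a) Choix 1.
#   (b) Choix 2.
#   Réponse(s) : (b)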
@@ -33,10 +41,17 @@ def extract_answer(answer, num_answers=5):
selected = [x.replace(')', '').replace('(', '') for x in selected]
return list(sorted(set([letter.lower() for letter in selected])))
def hamming(a, b, num):
A = [c.upper() if c in a else c for c in letters[:num]]
B = [c.upper() if c in b else c for c in letters[:num]]
return [x == y for x, y in zip(A, B)].count(True)
#def hamming(a, b, num):
# A = [c.upper() if c in a else c for c in letters[:num]]
# B = [c.upper() if c in b else c for c in letters[:num]]
# return [x == y for x, y in zip(A, B)].count(True)
def hamming(preds, refs):
corrects = [True for p in preds if p in refs]
corrects = sum(corrects)
total_refs = len(list(set(preds + refs)))
return corrects / total_refs
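# Worked example: hamming(['a', 'c'], ['a', 'b']) counts 1 correct prediction ('a')
# over the 3 letters in the union {'a', 'b', 'c'}, i.e. 1/3 -- a Jaccard-style overlap
# rather than the per-position count of the commented-out version above.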
def run_inference(generator, corpus_path, template):
with open(corpus_path) as fp:
@@ -56,7 +71,7 @@ def run_inference(generator, corpus_path, template):
print(answer, instance['correct_answers'])
if set(answer) == set(instance['correct_answers']):
num_exact_correct += 1
num_hamming_correct += hamming(answer, instance['correct_answers'], len(instance['answers']))
num_hamming_correct += hamming(answer, instance['correct_answers'])
num_hamming += len(instance['answers'])
results.append(instance['id'] + ';' + '|'.join(list(sorted(answer))))
# adapted from https://github.com/tloen/alpaca-lora/
import os
import uuid
import json
@@ -7,120 +8,126 @@ import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
LLAMA_VARIANT = 65
from deft import linearize_instance
# reasonable batch size with 80GB GPU
# 7B -> 24
# 13B -> 12
# 30B -> 6
# 65B -> 1
MICRO_BATCH_SIZE = 1
BATCH_SIZE = 24
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
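# e.g. BATCH_SIZE = 24 with MICRO_BATCH_SIZE = 1 gives 24 gradient-accumulation steps
# per optimizer update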
EPOCHS = 1 # from the result
LEARNING_RATE = 3e-4 # the karpathy constant
CUTOFF_LEN = 256
WARMUP_RATIO = 0.05
IS_INT8 = True
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
PROMPT = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n%s'''
def main(LLAMA_VARIANT : int = 65,
MICRO_BATCH_SIZE : int = 1,
BATCH_SIZE : int = 24,
EPOCHS : int = 1,
LEARNING_RATE : float = 3e-4,
CUTOFF_LEN : int = 256,
WARMUP_RATIO : float = 0.05,
IS_INT8 : bool = True,
LORA_R : int = 4,
LORA_ALPHA : int = 16,
         LORA_DROPOUT : float = 0.05,
valid_steps : int = 20,
PROMPT : str = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n%s''',
output_dir : str = 'deft_models',
train_json : str = '../../json/train.json',
dev_json : str = '../../json/dev.json',
):
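    # With fire.Fire(main) at the bottom of this script, every parameter above becomes a
    # command-line flag, e.g. (hypothetical invocation):
    #   python <this_script>.py --LLAMA_VARIANT=7 --EPOCHS=2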
#PROMPT = '''Ceci est une question de QCM de l\'examen de pharmacie. Réponds avec la ou les lettres correspondant à la bonne réponse.\n\n## Question 1\n\n%s'''
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
BACKBONE = "decapoda-research/llama-%db-hf" % LLAMA_VARIANT
OUTPUT_PATH = "%s/deft_%s_lora_%s" % (output_dir, BACKBONE.split('/')[-1], str(uuid.uuid4()))
print(OUTPUT_PATH)
os.makedirs(OUTPUT_PATH, exist_ok=True)
model = LlamaForCausalLM.from_pretrained(
BACKBONE,
load_in_8bit=IS_INT8,
device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(BACKBONE, add_eos_token=True)
if IS_INT8:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
target_modules=["q_proj", "v_proj"],
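        # q_proj / v_proj are the attention query and value projections in LLaMA;
        # LoRA adapters are injected only into those matrices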
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
data = load_dataset("json", data_files="../../json/train.json")
data_valid = load_dataset("json", data_files="../../json/dev.json")
data['validation'] = data_valid['train']
def generate_prompt(data_point):
return PROMPT % linearize_instance(data_point, include_correct_answers=True)
print(generate_prompt(data['train'][0]))
data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
)
)
BACKBONE = "decapoda-research/llama-%db-hf" % LLAMA_VARIANT
OUTPUT_PATH = "deft_models/deft_%s_lora_%s" % (BACKBONE.split('/')[-1], str(uuid.uuid4()))
print(OUTPUT_PATH)
os.makedirs(OUTPUT_PATH, exist_ok=True)
model = LlamaForCausalLM.from_pretrained(
BACKBONE,
load_in_8bit=IS_INT8,
device_map="auto",
)
tokenizer = LlamaTokenizer.from_pretrained(BACKBONE, add_eos_token=True)
if IS_INT8:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
r=LORA_R,
lora_alpha=LORA_ALPHA,
target_modules=["q_proj", "v_proj"],
lora_dropout=LORA_DROPOUT,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
data = load_dataset("json", data_files="../../json/train.json")
data_valid = load_dataset("json", data_files="../../json/dev.json")
data['validation'] = data_valid['train']
def linearize_instance(instance, include_correct_answers=False):
result = instance['question'] + '\n' + '\n'.join('(%s) %s.' % (k, v) for k, v in instance['answers'].items())
if include_correct_answers:
result += '\nRéponse(s) : ' + '; '.join('(%s) %s' % (a, instance['answers'][a]) for a in instance['correct_answers']) + '.\n'
else:
result += '\nRéponse(s) : ('
return result
def generate_prompt(data_point):
return PROMPT % linearize_instance(data_point, include_correct_answers=True)
print(generate_prompt(data['train'][0]))
data = data.shuffle().map(
lambda data_point: tokenizer(
generate_prompt(data_point),
truncation=True,
max_length=CUTOFF_LEN,
padding="max_length",
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
eval_dataset=data["validation"],
args=transformers.TrainingArguments(
do_eval=True,
evaluation_strategy='steps',
eval_steps=valid_steps,
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_ratio=WARMUP_RATIO,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir=OUTPUT_PATH,
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
)
trainer = transformers.Trainer(
model=model,
train_dataset=data["train"],
eval_dataset=data["validation"],
args=transformers.TrainingArguments(
do_eval=True,
evaluation_strategy='steps',
eval_steps=20,
per_device_train_batch_size=MICRO_BATCH_SIZE,
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
warmup_ratio=WARMUP_RATIO,
num_train_epochs=EPOCHS,
learning_rate=LEARNING_RATE,
fp16=True,
logging_steps=1,
output_dir=OUTPUT_PATH,
save_total_limit=3,
),
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_PATH)
with open("%s/training_config.json" % OUTPUT_PATH, 'w') as fp:
fp.write(json.dumps({
'backbone': BACKBONE,
'micro_batch_size': MICRO_BATCH_SIZE,
'batch_size': BATCH_SIZE,
'epochs': EPOCHS,
'learning_rate': LEARNING_RATE,
'cutoff_len': CUTOFF_LEN,
'warmup_ratio': WARMUP_RATIO,
'llama_variant': LLAMA_VARIANT,
'is_int8': IS_INT8,
'lora_r': LORA_R,
'lora_alpha': LORA_ALPHA,
'lora_dropout': LORA_DROPOUT,
'prompt': PROMPT,
'output_path': OUTPUT_PATH
}, indent=4))
print(OUTPUT_PATH)
model.config.use_cache = False
trainer.train(resume_from_checkpoint=False)
model.save_pretrained(OUTPUT_PATH)
with open("%s/training_config.json" % OUTPUT_PATH, 'w') as fp:
fp.write(json.dumps({
'backbone': BACKBONE,
'micro_batch_size': MICRO_BATCH_SIZE,
'batch_size': BATCH_SIZE,
'epochs': EPOCHS,
'learning_rate': LEARNING_RATE,
'cutoff_len': CUTOFF_LEN,
'warmup_ratio': WARMUP_RATIO,
'llama_variant': LLAMA_VARIANT,
'is_int8': IS_INT8,
'lora_r': LORA_R,
'lora_alpha': LORA_ALPHA,
'lora_dropout': LORA_DROPOUT,
'prompt': PROMPT,
'output_path': OUTPUT_PATH
}, indent=4))
print(OUTPUT_PATH)
if __name__ == '__main__':
import fire
fire.Fire(main)
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
import transformers
import torch
import sys
import json
from argparse import Namespace
from deft import linearize_instance, extract_answer, hamming
# stop inference when a given token was generated (for example '\n')
class TokenStopper(transformers.StoppingCriteria):
    def __init__(self, token, prompt_lengths):
        self.token = tokenizer.encode(token)[-1]  # id of the stop token (uses the global tokenizer)
        self.prompt_lengths = prompt_lengths
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # stop only once every sequence in the batch has produced the token after its prompt
        for sequence, length in zip(input_ids, self.prompt_lengths):
            sequence = sequence[length:]
            if (sequence == self.token).sum() == 0:
                return False
        return True
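# Intended use (currently commented out in run_batch below): pass
# [TokenStopper('\n', prompt_lengths)] as stopping_criteria to model.generate so that
# decoding halts once every sequence has emitted a newline after its prompt.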
def run_batch(batch, eval_stats, output_fp):
prompts = [config.prompt % linearize_instance(instance, add_left_parenthesis=True) for instance in batch]
inputs = tokenizer(
prompts,
return_tensors="pt",
padding=True,
).to('cuda')
lengths = [len(tokenizer.encode(prompt)) for prompt in prompts]
generation_config = GenerationConfig(
temperature=0.1,
top_p=0.75,
top_k=40,
num_beams=4,
)
with torch.no_grad():
generation_output = model.generate(
**inputs, max_new_tokens=128,
generation_config=generation_config,
#stopping_criteria=[TokenStopper('\n', [len(sequence) for sequence in inputs['input_ids']])]
)
for instance, PROMPT, length, sequence in zip(batch, prompts, lengths, generation_output):
generated = tokenizer.decode(sequence, skip_special_tokens=True)
print(generated)
answer = extract_answer(generated[len(PROMPT):].split('\n')[0])
print(answer, instance['correct_answers'])
output_fp.write(instance['id'] + ';' + '|'.join(sorted(answer)).lower() + '\n')
eval_stats['num_emr'] += 1
if set(answer) == set(instance['correct_answers']):
eval_stats['num_emr_correct'] += 1
eval_stats['num_hamming_sum'] += hamming(answer, instance['correct_answers'])
eval_stats['num_hamming'] += len(instance['answers'])
def main(model_path : str, output_path : str, corpus_path : str = '../../json/dev.json', adapted : bool = True, batch_size : int = 1):
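    # With fire.Fire(main) below, e.g. (hypothetical paths):
    #   python <this_script>.py --model_path=deft_models/deft_llama-65b-hf_lora_<uuid> --output_path=dev_answers.txt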
global config
import json
with open('%s/training_config.json' % model_path) as fp:
config = Namespace(**json.loads(fp.read()))
#config.prompt = "This is a MCQ from the biology exam in French. Answer with the correct set of letters.\n\n%s"
global tokenizer
tokenizer = LlamaTokenizer.from_pretrained(config.backbone, padding_side='left')
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
global model
model = LlamaForCausalLM.from_pretrained(
config.backbone,
load_in_8bit=config.is_int8,
torch_dtype=torch.float16,
device_map="auto",
)
if adapted:
# device_map={"": 0} fix device = None error https://github.com/huggingface/peft/issues/115
model = PeftModel.from_pretrained(model, model_path, torch_dtype=torch.float16, device_map={"": 0})
with open(corpus_path) as fp:
corpus = json.loads(fp.read())
eval_stats = {
'num_emr': 0,
'num_emr_correct': 0,
'num_hamming_sum': 0,
'num_hamming': 0,
}
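    # num_emr counts scored instances and num_emr_correct the exact matches;
    # num_hamming_sum accumulates the per-instance hamming() scores and num_hamming the
    # total number of candidate answers (these feed the EXACT MATCH and HAMMING DIST prints below).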
with open(output_path, 'w') as output_fp:
for i in range(0, len(corpus), batch_size):
run_batch(corpus[i: i + batch_size], eval_stats, output_fp)
print('EXACT MATCH:', config.output_path, eval_stats['num_emr_correct'] / eval_stats['num_emr'])
print('HAMMING DIST:', config.output_path, eval_stats['num_hamming_sum'] / eval_stats['num_hamming'])
if __name__ == '__main__':
import fire
fire.Fire(main)