Commit 973461f0 authored by LE Quy thanh

Merge branch 'master' into 'main'

Initial

See merge request !1
parents 0eb1d645 32a29539
from huggingface_hub import login
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
import gc
import os.path
import statistics
import csv
from collections import Counter
import random
import numpy as np
import argparse
import re
from unsloth import FastLanguageModel
import json
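# Load the training split: each record holds a claim, its gold verdict label,
# and a list of question-answer evidence pairs (with retrieved top_k_doc passages).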
with open("origin_train_data.json") as f:
train_data = json.load(f)
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-32B-unsloth-bnb-4bit"
# Put your Hugging Face token here
hf_token = ""
login(hf_token)
dtype = None
load_in_4bit = True
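# Load the 4-bit quantized DeepSeek-R1-Distill-Qwen-32B base model and tokenizer via
# Unsloth; dtype=None leaves the precision choice (bfloat16/float16) to auto-detection.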
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = 8196,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)
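# Build one SFT sample per claim: a fixed system prompt, a user prompt that concatenates
# the instruction, the claim, and every Q/A evidence pair, and the gold verdict
# ("Verdict: <label>") as the assistant response.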
def get_features(data):
    samples = []
    system = """
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond "True" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond "False" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond "Half True/False" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
"""
    prompt = "Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label."
    for index, fact in enumerate(data):
        sample = {}
        sample["response"] = "Verdict: " + fact["label"]
        claim = fact["claim"]
        evidences = []
        questions = []
        fact["input_str"] = prompt + "\n\nClaim: " + claim + "\n\n" + "\n\n".join([
            f"Q{i+1}: {qa['questions']}\nA{i+1}: {qa['top_k_doc'][0] if qa['top_k_doc'] else 'No answer could be found.'}"
            for i, qa in enumerate(fact["evidences"])
        ])
        sample["system"] = system
        # "No answer could be found."
        for question in fact["evidences"]:
            if len(question["top_k_doc"]) > 0:
                evidences.append(question["top_k_doc"][0])
                questions.append(question["questions"])
        sample["prompt"] = fact["input_str"]
        samples.append(sample)
    return samples
train_features = get_features(train_data)
train_features = pd.DataFrame(train_features)
train_data = Dataset.from_pandas(train_features)
def format_chat_template(example):
    example_json = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["response"]}]
    example["text"] = tokenizer.apply_chat_template(example_json, tokenize=False)
    return example
train_data = train_data.map(format_chat_template, num_proc= 4)
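# Attach LoRA adapters (rank 64, alpha 16) to the attention and MLP projection layers;
# only these adapter weights are trained on top of the frozen 4-bit base model.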
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
)
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
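# Supervised fine-tuning on the chat-formatted "text" field: 2 epochs, batch size 2
# per device, 5 warmup steps, linear LR schedule, and 8-bit AdamW to save memory.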
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=8196,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        warmup_steps=5,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)
import os
from datetime import datetime
from pathlib import Path
import shutil
model_id_clean = re.sub(r'[\\/#!?%:]', '_', model_id)
saved_model_name = "model"
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
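# Run fine-tuning, then save the LoRA adapter weights and tokenizer to a local folder
# named after the sanitized base model id.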
trainer_stats = trainer.train()
model.save_pretrained(f"model_{model_id_clean}") # Local saving
tokenizer.save_pretrained(f"model_{model_id_clean}")
print(f"Results saved to model_{model_id_clean}")
system,prompt,label,claim,response,prediction
"
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
","<|begin▁of▁sentence|>
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
<|User|>Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label.
Claim: Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.
Q1: does david perdue want to abolish the u.s. department of education?
A1: last week, kentucky rep. thomas massie introduced a bill to abolish the education department, and it's a short, one-sentence read: ""the department of education shall terminate on december 31, 2023.""
Q2: would abolishing the u.s. department of education cost georgia schools $1 billion a year?
A2: 27 juil. 2022 georgia's fy 2022 education budget sends $10.2 billion to k-12 public schools, maintaining a budget cut of $383 million.
Q3: would abolishing the u.s. department of education eliminate millions in student loans?
A3: in student loans would be at risk; 23.9 million* low-income students abolishment of the u.s. department of education. at the time, he and<|Assistant|><think>
",Conflicting,"Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.","<|begin▁of▁sentence|><|begin▁of▁sentence|>
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
<|User|>Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label.
Claim: Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.
Q1: does david perdue want to abolish the u.s. department of education?
A1: last week, kentucky rep. thomas massie introduced a bill to abolish the education department, and it's a short, one-sentence read: ""the department of education shall terminate on december 31, 2023.""
Q2: would abolishing the u.s. department of education cost georgia schools $1 billion a year?
A2: 27 juil. 2022 georgia's fy 2022 education budget sends $10.2 billion to k-12 public schools, maintaining a budget cut of $383 million.
Q3: would abolishing the u.s. department of education eliminate millions in student loans?
A3: in student loans would be at risk; 23.9 million* low-income students abolishment of the u.s. department of education. at the time, he and<|Assistant|><think>
Okay, so I need to figure out whether the claim about David Perdue wanting to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans, is True, False, or Half True/False based on the provided Q&A evidence.
First, let's break down the claim into parts. The claim has three main points:
1. David Perdue wants to abolish the U.S. Department of Education.
2. Abolishing the Department would cost Georgia schools $1 billion a year.
3. It would eliminate millions in student loans.
Now, looking at the evidence provided:
Q1 asks if David Perdue wants to abolish the Department. The answer says that Kentucky Rep. Thomas Massie introduced a bill to abolish it, not David Perdue. So, the evidence doesn't mention Perdue wanting to do this. That's a problem because the claim attributes this desire to Perdue, but the evidence points to someone else.
Q2 asks if abolishing the Department would cost Georgia $1 billion. The answer mentions Georgia's FY 2022 budget sent $10.2 billion to K-12 schools but doesn't specify how much comes from the federal Department of Education. It also mentions a budget cut of $383 million, but that's a state-level cut, not necessarily from abolishing the Department. So, the evidence doesn't directly support the $1 billion figure.
Q3 asks about student loans. The answer says that 23.9 million low-income students would be at risk if the Department is abolished. However, it doesn't specify whether this would eliminate millions in loans or how exactly the loans would be affected. It's a bit vague on the impact.
Putting it all together, the evidence doesn't support the first part of the claim about Perdue wanting to abolish the Department. It also doesn't provide clear evidence that Georgia would lose $1 billion or that millions in loans would be eliminated. The information is either about someone else (Massie) or not specific enough about the financial impacts.
Therefore, the claim isn't fully supported by the evidence. It's misleading because it attributes the desire to Perdue when the evidence points to Massie, and it doesn't back up the financial figures. So, the verdict should be False.
</think>
**Verdict: False**
**Step-by-Step Explanation:**
1. **Claim Analysis:** The claim attributes three actions to David Perdue: wanting to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion annually and eliminate millions in student loans.
2. **Evidence Review:**
- **Q1/A1:** Indicates Rep. Thomas Massie introduced the bill, not David Perdue.
- **Q2/A2:** Georgia's budget details don't specify federal contributions or a $1 billion loss.
- **Q3/A3:** Vague on the impact of abolishing the Department on student loans.
3. **Conclusion:** The evidence contradicts the claim regarding Perdue's involvement and lacks support for the financial impacts stated. Thus, the claim is misleading and unsupported.
**Final Answer:** False<|end▁of▁sentence|>",False
result_metrics_confusion_matrix.png (18.8 KiB)

Weighted F1 Score,Macro F1 Score,Accuracy
0.0,0.0,0.0
test.py 0 → 100644
from huggingface_hub import login
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
import csv
from collections import Counter
import random
import numpy as np
import argparse
import re
import json
model_id = "unsloth_DeepSeek-R1-Distill-Qwen-32B-unsloth-bnb-4bit"
# Put your Hugging Face token here
hf_token = ""
login(hf_token)
from unsloth import FastLanguageModel
dtype = None
load_in_4bit = True
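# Reload the model and tokenizer for evaluation. The model_id above is taken as-is; it is
# presumably meant to resolve to the fine-tuned weights (e.g. the locally saved
# "model_<model_id_clean>" folder written by the training script) rather than a hub repo.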
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = 8196,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)
with open("test_claimdecomp_evidence_question_mapping_origin.json") as f:
test_data = json.load(f)
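# Build one test sample per claim, deduplicating Q/A pairs whose top retrieved document
# has already been used so the prompt does not repeat identical evidence.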
def get_features_test(data):
    samples = []
    prompt = "Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label."
    system = """
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond "True" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond "False" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond "Half True/False" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
"""
    for fact in data:  # Assumes `data` is a list of fact-check records
        claim = fact.get("claim", "")  # Get the claim from each fact
        seen_docs = set()  # Set to track answer documents that have already appeared
        filtered_qa = []  # New list holding the valid Q/A pairs
        for qa in fact.get("evidences", []):
            top_doc = qa['top_k_doc'][0] if qa['top_k_doc'] else ''
            if top_doc and top_doc in seen_docs:
                # print(top_doc)
                continue  # Skip this Q and A if top_k_doc[0] has already appeared
            seen_docs.add(top_doc)  # Mark this document as seen
            filtered_qa.append(qa)  # Add to the list of valid pairs
        fact["input_str"] = (
            prompt + "\n\nClaim: " + claim + "\n\n" + "\n\n".join([
                f"Q{i+1}: {qa['questions']}\nA{i+1}: {qa['top_k_doc'][0] if qa['top_k_doc'] else 'No answer could be found.'}"
                for i, qa in enumerate(filtered_qa)
            ])
        )
        # print(fact["input_str"])
        sample = {}  # Initialize the sample
        sample["system"] = system
        sample["prompt"] = fact["input_str"]
        sample["label"] = fact["label"]
        sample["claim"] = fact["claim"]
        samples.append(sample)
    return samples
test_features = get_features_test(test_data)
test_features = pd.DataFrame(test_features)
# test_features = test_features[35:36]
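# Generate a verdict for one sample: decode the full model output, take the text after
# "Verdict:", and map it to a label ("Half True/False" is recorded as "Conflicting",
# the label name used in the test set).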
def generate_response(sample, model):
    inputs = tokenizer([sample['prompt']], return_tensors='pt').to("cuda")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=3000,
        use_cache=False, temperature=0.1
    )
    response = tokenizer.batch_decode(outputs)
    sample['response'] = response[0]
    if "Verdict:" in response[0]:
        response = response[0].split("Verdict:")[1]
    else:
        print("'Verdict:' not found in the response.")
        print(response[0])
        response = ""  # Or handle as appropriate
    if "Half True/False" in response:
        sample["prediction"] = "Conflicting"
    elif "False" in response:
        sample["prediction"] = "False"
    elif "True" in response:
        sample["prediction"] = "True"
    else:
        print("Error: ", response)
        sample["prediction"] = "Conflicting"
        # exit(0)
    return sample
def format_chat_template_test(example):
    example_json = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["prompt"]}
    ]
    example["prompt"] = tokenizer.apply_chat_template(example_json, tokenize=False, add_generation_prompt=True)
    return example
test_data = test_features.apply(format_chat_template_test, axis=1)
FastLanguageModel.for_inference(model)
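# Switch Unsloth to inference mode and run generation over every test row,
# tracking progress with a tqdm bar.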
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm progress bars for pandas apply
test_data = test_data.progress_apply(lambda sample: generate_response(sample, model), axis=1)
import os
from datetime import datetime
from pathlib import Path
model_id_clean = re.sub(r'[\\/#!?%:]', '_', model_id)  # Sanitize the model id for use in file names
saved_model_name = "model"
# Get the current time and format it as a string
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_data = test_data
# Convert to a DataFrame
df = pd.DataFrame(test_data)
# Save the DataFrame to CSV
output_file = f"log_{model_id_clean}.csv"
df.to_csv(output_file, index=False)
print(f"log saved to {output_file}")
actual = test_data['label']
prediction = test_data['prediction']
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
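# Score the predictions: accuracy, weighted F1, and macro F1, plus a confusion matrix
# saved as result_metrics_confusion_matrix.png and the scores written to CSV.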
def run_metrics(actual, prediction):
    """Compute accuracy and F1 metrics, plot the confusion matrix, and write the scores to CSV."""
    print("Running metrics with the following settings:")
    # Calculate metrics
    accuracy = accuracy_score(actual, prediction)
    m_f1 = f1_score(actual, prediction, average='macro')  # Use 'macro' for multi-class
    w_f1 = f1_score(actual, prediction, average='weighted')
    metrics = {
        "W-F1 Score": w_f1,
        "M-F1 Score": m_f1,
        "Accuracy": accuracy
    }
    labels = ["Conflicting", "False", "True"]
    # Pass `labels` so the matrix rows/columns match the tick labels below.
    cm = confusion_matrix(actual, prediction, labels=labels)
    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.savefig("result_metrics_confusion_matrix.png")
    plt.show()
    # Collect the metrics under descriptive column names
    result_data = {}
    print(metrics)
    result_data['Weighted F1 Score'] = metrics['W-F1 Score']
    result_data['Macro F1 Score'] = metrics['M-F1 Score']
    result_data['Accuracy'] = metrics['Accuracy']
    output_file = "result_metrics_temp_score.csv"
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=result_data.keys())
        writer.writeheader()  # Write the header
        writer.writerow(result_data)  # Write the metrics
    print(f"Metrics saved to {output_file}")
run_metrics(actual, prediction)