Commit 973461f0 authored by LE Quy thanh

Merge branch 'master' into 'main'

Initial

See merge request !1
parents 0eb1d645 32a29539
from huggingface_hub import login
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
import gc
import os.path
import statistics
import csv
from collections import Counter
import random
import numpy as np
import argparse
import re
from unsloth import FastLanguageModel
import json
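# Load the training split: each record holds a claim, its gold verdict label,
# and a list of question-answer evidence pairs (with retrieved top_k_doc passages).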
with open("origin_train_data.json") as f:
train_data = json.load(f)
model_id = "unsloth/DeepSeek-R1-Distill-Qwen-32B-unsloth-bnb-4bit"
# Put your Hugging Face token here
hf_token = ""
login(hf_token)
dtype = None
load_in_4bit = True
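# Load the 4-bit quantized DeepSeek-R1-Distill-Qwen-32B base model and tokenizer via
# Unsloth; dtype=None leaves the precision choice (bfloat16/float16) to auto-detection.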
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = 8196,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)
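# Build one SFT sample per claim: a fixed system prompt, a user prompt that concatenates
# the instruction, the claim, and every Q/A evidence pair, and the gold verdict
# ("Verdict: <label>") as the assistant response.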
def get_features(data):
    samples = []
    system = """
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond "True" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond "False" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond "Half True/False" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
"""
    prompt = "Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label."
    for index, fact in enumerate(data):
        sample = {}
        sample["response"] = "Verdict: " + fact["label"]
        claim = fact["claim"]
        evidences = []
        questions = []
        fact["input_str"] = prompt + "\n\nClaim: " + claim + "\n\n" + "\n\n".join([
            f"Q{i+1}: {qa['questions']}\nA{i+1}: {qa['top_k_doc'][0] if qa['top_k_doc'] else 'No answer could be found.'}"
            for i, qa in enumerate(fact["evidences"])
        ])
        sample["system"] = system
        # "No answer could be found."
        for question in fact["evidences"]:
            if len(question["top_k_doc"]) > 0:
                evidences.append(question["top_k_doc"][0])
                questions.append(question["questions"])
        sample["prompt"] = fact["input_str"]
        samples.append(sample)
    return samples
train_features = get_features(train_data)
train_features = pd.DataFrame(train_features)
train_data = Dataset.from_pandas(train_features)
def format_chat_template(example):
    example_json = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["response"]}]
    example["text"] = tokenizer.apply_chat_template(example_json, tokenize=False)
    return example
train_data = train_data.map(format_chat_template, num_proc= 4)
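# Attach LoRA adapters (rank 64, alpha 16) to the attention and MLP projection layers;
# only these adapter weights are trained on top of the frozen 4-bit base model.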
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
)
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
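# Supervised fine-tuning on the chat-formatted "text" field: 2 epochs, batch size 2
# per device, 5 warmup steps, linear LR schedule, and 8-bit AdamW to save memory.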
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=8196,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        warmup_steps=5,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)
import os
from datetime import datetime
from pathlib import Path
import shutil
model_id_clean = re.sub(r'[\\/#!?%:]', '_', model_id)
saved_model_name = "model"
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
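# Run fine-tuning, then save the LoRA adapter weights and tokenizer to a local folder
# named after the sanitized base model id.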
trainer_stats = trainer.train()
model.save_pretrained(f"model_{model_id_clean}") # Local saving
tokenizer.save_pretrained(f"model_{model_id_clean}")
print(f"Results saved to model_{model_id_clean}")
system,prompt,label,claim,response,prediction
"
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
","<|begin▁of▁sentence|>
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
<|User|>Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label.
Claim: Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.
Q1: does david perdue want to abolish the u.s. department of education?
A1: last week, kentucky rep. thomas massie introduced a bill to abolish the education department, and it's a short, one-sentence read: ""the department of education shall terminate on december 31, 2023.""
Q2: would abolishing the u.s. department of education cost georgia schools $1 billion a year?
A2: 27 juil. 2022 georgia's fy 2022 education budget sends $10.2 billion to k-12 public schools, maintaining a budget cut of $383 million.
Q3: would abolishing the u.s. department of education eliminate millions in student loans?
A3: in student loans would be at risk; 23.9 million* low-income students abolishment of the u.s. department of education. at the time, he and<|Assistant|><think>
",Conflicting,"Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.","<|begin▁of▁sentence|><|begin▁of▁sentence|>
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond ""True"" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond ""False"" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond ""Half True/False"" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
<|User|>Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label.
Claim: Says David Perdue wants to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans.
Q1: does david perdue want to abolish the u.s. department of education?
A1: last week, kentucky rep. thomas massie introduced a bill to abolish the education department, and it's a short, one-sentence read: ""the department of education shall terminate on december 31, 2023.""
Q2: would abolishing the u.s. department of education cost georgia schools $1 billion a year?
A2: 27 juil. 2022 georgia's fy 2022 education budget sends $10.2 billion to k-12 public schools, maintaining a budget cut of $383 million.
Q3: would abolishing the u.s. department of education eliminate millions in student loans?
A3: in student loans would be at risk; 23.9 million* low-income students abolishment of the u.s. department of education. at the time, he and<|Assistant|><think>
Okay, so I need to figure out whether the claim about David Perdue wanting to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion a year and eliminate millions in student loans, is True, False, or Half True/False based on the provided Q&A evidence.
First, let's break down the claim into parts. The claim has three main points:
1. David Perdue wants to abolish the U.S. Department of Education.
2. Abolishing the Department would cost Georgia schools $1 billion a year.
3. It would eliminate millions in student loans.
Now, looking at the evidence provided:
Q1 asks if David Perdue wants to abolish the Department. The answer says that Kentucky Rep. Thomas Massie introduced a bill to abolish it, not David Perdue. So, the evidence doesn't mention Perdue wanting to do this. That's a problem because the claim attributes this desire to Perdue, but the evidence points to someone else.
Q2 asks if abolishing the Department would cost Georgia $1 billion. The answer mentions Georgia's FY 2022 budget sent $10.2 billion to K-12 schools but doesn't specify how much comes from the federal Department of Education. It also mentions a budget cut of $383 million, but that's a state-level cut, not necessarily from abolishing the Department. So, the evidence doesn't directly support the $1 billion figure.
Q3 asks about student loans. The answer says that 23.9 million low-income students would be at risk if the Department is abolished. However, it doesn't specify whether this would eliminate millions in loans or how exactly the loans would be affected. It's a bit vague on the impact.
Putting it all together, the evidence doesn't support the first part of the claim about Perdue wanting to abolish the Department. It also doesn't provide clear evidence that Georgia would lose $1 billion or that millions in loans would be eliminated. The information is either about someone else (Massie) or not specific enough about the financial impacts.
Therefore, the claim isn't fully supported by the evidence. It's misleading because it attributes the desire to Perdue when the evidence points to Massie, and it doesn't back up the financial figures. So, the verdict should be False.
</think>
**Verdict: False**
**Step-by-Step Explanation:**
1. **Claim Analysis:** The claim attributes three actions to David Perdue: wanting to abolish the U.S. Department of Education, which would cost Georgia schools $1 billion annually and eliminate millions in student loans.
2. **Evidence Review:**
- **Q1/A1:** Indicates Rep. Thomas Massie introduced the bill, not David Perdue.
- **Q2/A2:** Georgia's budget details don't specify federal contributions or a $1 billion loss.
- **Q3/A3:** Vague on the impact of abolishing the Department on student loans.
3. **Conclusion:** The evidence contradicts the claim regarding Perdue's involvement and lacks support for the financial impacts stated. Thus, the claim is misleading and unsupported.
**Final Answer:** False<|end▁of▁sentence|>",False
result_metrics_confusion_matrix.png (18.8 KiB)

Weighted F1 Score,Macro F1 Score,Accuracy
0.0,0.0,0.0
test.py 0 → 100644
from huggingface_hub import login
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
import csv
from collections import Counter
import random
import numpy as np
import argparse
import re
import json
model_id = "unsloth_DeepSeek-R1-Distill-Qwen-32B-unsloth-bnb-4bit"
# Put your Hugging Face token here
hf_token = ""
login(hf_token)
from unsloth import FastLanguageModel
dtype = None
load_in_4bit = True
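# Reload the model and tokenizer for evaluation. The model_id above is taken as-is; it is
# presumably meant to resolve to the fine-tuned weights (e.g. the locally saved
# "model_<model_id_clean>" folder written by the training script) rather than a hub repo.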
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = 8196,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)
with open("test_claimdecomp_evidence_question_mapping_origin.json") as f:
test_data = json.load(f)
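# Build one test sample per claim, deduplicating Q/A pairs whose top retrieved document
# has already been used so the prompt does not repeat identical evidence.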
def get_features_test(data):
    samples = []
    prompt = "Predict the verdict of a claim based on the provided question-answer pair evidence. Choose from the labels: 'True', 'False', 'Half True/False'. Disregard irrelevant question-answer pairs when assessing the claim. Justify your decision step by step using the provided evidence and select the appropriate label."
    system = """
You are a fact-checker. You have done a fact-check to verify a Claim based on the provided question-answer pair evidence.
Your task is to predict the verdict of a claim based on the provided question-answer pair evidence whether the Claim is one of the labels: 'True', 'False', 'Half True/False'. Do this by following:
- Respond "True" only if the relevant evidence fully or almost fully supports and verifies the claim as correct.
- Respond "False" if:
- The relevant evidence contradicts or disproves the claim.
- The claim is misleading based on the relevant evidence.
- The evidence is too weak or insufficient to support the claim.
- Respond "Half True/False" if the evidence is ambiguous, incomplete, or inconclusive, making it impossible to determine if the claim is fully true or false.
Always adhere to the following rules:
- Use information only from the recorded evidence: Avoid inserting information that is not implied by the evidence. You may use commonsense knowledge, though.
- Avoid repeating yourself.
"""
    for fact in data:  # Assumes `data` is a list of fact-check records
        claim = fact.get("claim", "")  # Get the claim from each fact
        seen_docs = set()  # Set to track answer documents that have already appeared
        filtered_qa = []  # New list holding the valid Q/A pairs
        for qa in fact.get("evidences", []):
            top_doc = qa['top_k_doc'][0] if qa['top_k_doc'] else ''
            if top_doc and top_doc in seen_docs:
                # print(top_doc)
                continue  # Skip this Q and A if top_k_doc[0] has already appeared
            seen_docs.add(top_doc)  # Mark this document as seen
            filtered_qa.append(qa)  # Add to the list of valid pairs
        fact["input_str"] = (
            prompt + "\n\nClaim: " + claim + "\n\n" + "\n\n".join([
                f"Q{i+1}: {qa['questions']}\nA{i+1}: {qa['top_k_doc'][0] if qa['top_k_doc'] else 'No answer could be found.'}"
                for i, qa in enumerate(filtered_qa)
            ])
        )
        # print(fact["input_str"])
        sample = {}  # Initialize the sample
        sample["system"] = system
        sample["prompt"] = fact["input_str"]
        sample["label"] = fact["label"]
        sample["claim"] = fact["claim"]
        samples.append(sample)
    return samples
test_features = get_features_test(test_data)
test_features = pd.DataFrame(test_features)
# test_features = test_features[35:36]
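# Generate a verdict for one sample: decode the full model output, take the text after
# "Verdict:", and map it to a label ("Half True/False" is recorded as "Conflicting",
# the label name used in the test set).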
def generate_response(sample, model):
    inputs = tokenizer([sample['prompt']], return_tensors='pt').to("cuda")
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=3000,
        use_cache=False, temperature=0.1
    )
    response = tokenizer.batch_decode(outputs)
    sample['response'] = response[0]
    if "Verdict:" in response[0]:
        response = response[0].split("Verdict:")[1]
    else:
        print("'Verdict:' not found in the response.")
        print(response[0])
        response = ""  # Or handle as appropriate
    if "Half True/False" in response:
        sample["prediction"] = "Conflicting"
    elif "False" in response:
        sample["prediction"] = "False"
    elif "True" in response:
        sample["prediction"] = "True"
    else:
        print("Error: ", response)
        sample["prediction"] = "Conflicting"
        # exit(0)
    return sample
def format_chat_template_test(example):
    example_json = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["prompt"]}
    ]
    example["prompt"] = tokenizer.apply_chat_template(example_json, tokenize=False, add_generation_prompt=True)
    return example
test_data = test_features.apply(format_chat_template_test, axis=1)
FastLanguageModel.for_inference(model)
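# Switch Unsloth to inference mode and run generation over every test row,
# tracking progress with a tqdm bar.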
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm progress bars for pandas apply
test_data = test_data.progress_apply(lambda sample: generate_response(sample, model), axis=1)
import os
from datetime import datetime
from pathlib import Path
model_id_clean = re.sub(r'[\\/#!?%:]', '_', model_id)  # Sanitize the model id for use in file names
saved_model_name = "model"
# Get the current time and format it as a string
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_data = test_data
# Convert to a DataFrame
df = pd.DataFrame(test_data)
# Save the DataFrame to CSV
output_file = f"log_{model_id_clean}.csv"
df.to_csv(output_file, index=False)
print(f"log saved to {output_file}")
actual = test_data['label']
prediction = test_data['prediction']
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
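# Score the predictions: accuracy, weighted F1, and macro F1, plus a confusion matrix
# saved as result_metrics_confusion_matrix.png and the scores written to CSV.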
def run_metrics(actual, prediction):
    """Compute accuracy and F1 metrics, plot the confusion matrix, and write the scores to CSV."""
    print("Running metrics with the following settings:")
    # Calculate metrics
    accuracy = accuracy_score(actual, prediction)
    m_f1 = f1_score(actual, prediction, average='macro')  # Use 'macro' for multi-class
    w_f1 = f1_score(actual, prediction, average='weighted')
    metrics = {
        "W-F1 Score": w_f1,
        "M-F1 Score": m_f1,
        "Accuracy": accuracy
    }
    labels = ["Conflicting", "False", "True"]
    # Pass `labels` so the matrix rows/columns match the tick labels below.
    cm = confusion_matrix(actual, prediction, labels=labels)
    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.savefig("result_metrics_confusion_matrix.png")
    plt.show()
    # Collect the metrics under descriptive column names
    result_data = {}
    print(metrics)
    result_data['Weighted F1 Score'] = metrics['W-F1 Score']
    result_data['Macro F1 Score'] = metrics['M-F1 Score']
    result_data['Accuracy'] = metrics['Accuracy']
    output_file = "result_metrics_temp_score.csv"
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=result_data.keys())
        writer.writeheader()  # Write the header
        writer.writerow(result_data)  # Write the metrics
    print(f"Metrics saved to {output_file}")
run_metrics(actual, prediction)