Commit 853d9bea by tongtao.ling

update model.py

parent 29a4e564
__pycache__/
dataset/
output/
pretrain_data/
+runs/
+test_data
+*.ipynb
@@ -2,7 +2,7 @@
"vocab_size":272833,
"d_model": 256,
"num_attention_heads":8,
-"max_position_embeddings":128,
+"max_position_embeddings":256,
"hidden_dropout_prob":0.1,
"attention_probs_dropout_prob":0.1,
"layer_norm_eps":1e-12,
...
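For context, the fields above are consumed through CTConfig('config.json'); a minimal illustrative sketch of such a JSON-backed config object is shown below (the project's actual config.py is not part of this commit, so the class name here is an assumption):

import json

class JsonConfigSketch:
    """Illustrative stand-in for a JSON-backed config: exposes the JSON keys as attributes."""
    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            for key, value in json.load(f).items():
                setattr(self, key, value)

# cfg = JsonConfigSketch("config.json")
# cfg.max_position_embeddings  # -> 256 after this commit, matching --max_seq_length / --max_length below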
import onnx
import os
import torch
import random
import onnxruntime
import numpy as np
from pprint import pprint
from config import CTConfig
from tokenizer import CTTokenizer
from model import CTTransformerForPreTraining
from onnxruntime.quantization import quantize_dynamic, QuantType
config = CTConfig("./config.json")
tokenizer = CTTokenizer("./vocab.json")
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
num_labels = len(label_list)
config.label_list = label_list
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
model_path = "./output/finetune_output/best_checkpoint.pt"
state_dict = torch.load(model_path, weights_only=True, map_location="cuda:0")
model.load_state_dict(state_dict)
model.eval()
def generate_inputs(batch_size, max_len):
input_ids = []
for _ in range(batch_size):
seq_length = random.randint(5,10)
token_ids = np.random.randint(1,config.vocab_size,(seq_length,))
token_ids = [1] + token_ids.tolist() + [2]
token_ids = np.array(token_ids)
token_ids = np.append(token_ids,np.zeros(max_len-len(token_ids)),axis=0)
input_ids.append(torch.tensor(token_ids).long())
input_ids = torch.stack(input_ids)
attention_mask = (input_ids != 0).long()
print(input_ids)
print(attention_mask)
return input_ids, attention_mask
def export_onnx(output_dir):
# Export the model to ONNX format
batch_size = 1 # batch size
sequence_length = 256
dynamic_axes= {"input_ids":{0:"batch_size",1:"sequence_length"},
"attention_mask":{0:"batch_size",1:"sequence_length"},
"logits":{0:"batch_size",1:"sequence_length"},
}
token_ids, attention_mask = generate_inputs(batch_size, sequence_length)
dummy_input = {
"input_ids": token_ids,
"attention_mask":attention_mask
}
torch.onnx.export(model, dummy_input, os.path.join(output_dir,'model.onnx'),
export_params=True,
training=torch.onnx.TrainingMode.EVAL,
# opset_version=15,
do_constant_folding=True,
input_names=['input_ids','attention_mask'],
output_names=['logits'],
dynamic_axes=dynamic_axes,
verbose=True)
def quantize(model_fp32, model_quant):
quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QInt8)
def printinfo(onnx_session):
print("----------------- 输入部分 -----------------")
input_tensors = onnx_session.get_inputs() # 该 API 会返回列表
for input_tensor in input_tensors: # 因为可能有多个输入,所以为列表
input_info = {
"name" : input_tensor.name,
"type" : input_tensor.type,
"shape": input_tensor.shape,
}
pprint(input_info)
print("----------------- 输出部分 -----------------")
output_tensors = onnx_session.get_outputs() # 该 API 会返回列表
for output_tensor in output_tensors: # 因为可能有多个输出,所以为列表
output_info = {
"name" : output_tensor.name,
"type" : output_tensor.type,
"shape": output_tensor.shape,
}
pprint(output_info)
if __name__ == "__main__":
output_dir = "./output/finetune_output"
export_onnx(output_dir)
model = onnx.load(os.path.join(output_dir,"model.onnx"))
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))
ort_session = onnxruntime.InferenceSession(os.path.join(output_dir,"model.onnx"))
printinfo(ort_session)
batch_size = 1 # batch size
sequence_length = 256
token_ids, attention_mask = generate_inputs(batch_size, sequence_length)
outputs = ort_session.run(None, {
"input_ids": token_ids.numpy(),
"attention_mask":attention_mask.numpy(),
})
print(outputs[0].shape)
model_fp32 = os.path.join(output_dir,"model.onnx")
model_quant = os.path.join(output_dir,"model_quant.onnx")
quantize(model_fp32, model_quant)
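A short sanity check of the dynamically quantized model against the fp32 export could look like the sketch below (illustrative only; it reuses ort_session, model_quant and generate_inputs from the script above):

quant_session = onnxruntime.InferenceSession(model_quant)
token_ids, attention_mask = generate_inputs(1, 256)
feed = {"input_ids": token_ids.numpy(), "attention_mask": attention_mask.numpy()}
fp32_logits = ort_session.run(None, feed)[0]
quant_logits = quant_session.run(None, feed)[0]
print("max abs diff:", np.abs(fp32_logits - quant_logits).max())  # expect a small but non-zero gap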
...@@ -8,19 +8,14 @@ import pandas as pd ...@@ -8,19 +8,14 @@ import pandas as pd
import string import string
import threading import threading
from tqdm import tqdm from tqdm import tqdm
from tokenizer import Tokenizer from glob import glob
from tokenizer import CTTokenizer
chinese_punctuation = ",。!?;:()【】《》“”‘’、" chinese_punctuation = ",。!?;:()【】《》“”‘’、"
punctuation_list = string.punctuation + chinese_punctuation punctuation_list = string.punctuation + chinese_punctuation
interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯'] interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯']
# interregnum_list = [token for token, pos in dt.word_tag_tab.items() if pos == "y"] tokenizer = CTTokenizer("vocab.json")
# interregnum_list.extend(["啊","哦","哎","恩","嗯"])
# interregnum_list.remove("吧")
# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = Tokenizer("vocab.json")
def process_text(sent): def process_text(sent):
sent = sent.replace("\n","").replace("\\n","").replace("|","") sent = sent.replace("\n","").replace("\\n","").replace("|","")
...@@ -136,9 +131,8 @@ def create_csc(data_dir, output_dir): ...@@ -136,9 +131,8 @@ def create_csc(data_dir, output_dir):
for split in ["train","dev","test"]: for split in ["train","dev","test"]:
# for split in ["test"]: # for split in ["test"]:
data = json.load(open(os.path.join(data_dir, f"{split}.json"),"r",encoding="utf-8")) data = json.load(open(os.path.join(data_dir, f"{split}.json"),"r",encoding="utf-8"))
data = data[:100]
token_label_data = [] token_label_data = []
for item in data: for item in tqdm(data):
# sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的," # sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的,"
sentence = item["correct_text"] sentence = item["correct_text"]
# sentence = process_text(sentence) # sentence = process_text(sentence)
...@@ -158,12 +152,24 @@ def create_csc(data_dir, output_dir): ...@@ -158,12 +152,24 @@ def create_csc(data_dir, output_dir):
for i in token_label_data: for i in token_label_data:
f.write(str(i)+"\n") f.write(str(i)+"\n")
def create_lang_8(data_dir, output_dir): def remove_non_chinese_english_chars(text):
# Character ranges to keep
chinese_characters = r'\u4e00-\u9fff' # basic CJK ideographs
english_characters = r'a-zA-Z'
punctuation = re.escape('''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~,。!?、;:“”‘’《》〈〉【】〔〕()‥·°〃※》々◦〝〞〃〄〆''')
# Regex pattern matching any run of characters outside the kept ranges
pattern = f'[^{chinese_characters}{english_characters}{punctuation}]+'
# Strip characters that are not in the kept ranges
cleaned_text = re.sub(pattern, '', text)
return cleaned_text
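Illustrative behaviour of the cleaner above (assumption: spaces, emoji and any symbol outside the kept ranges are stripped, while listed punctuation survives):

print(remove_non_chinese_english_chars("你好😊 hello, world!"))  # -> "你好hello,world!"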
def create_lang_8(data, output_dir, output_name):
if not os.path.exists(output_dir): if not os.path.exists(output_dir):
os.mkdir(output_dir) os.mkdir(output_dir)
data = open(os.path.join(data_dir,"NLPCC2018_GEC_TrainingData/data.train"),"r",encoding="utf-8").readlines()
print(f"data size: {len(data)}")
token_label_data = [] token_label_data = []
for item in tqdm(data): for item in tqdm(data):
sentence = item.split("\t")[-1] sentence = item.split("\t")[-1]
...@@ -180,7 +186,7 @@ def create_lang_8(data_dir, output_dir): ...@@ -180,7 +186,7 @@ def create_lang_8(data_dir, output_dir):
# train_data = token_label_data[:split_index] # train_data = token_label_data[:split_index]
# dev_data = token_label_data[split_index:] # dev_data = token_label_data[split_index:]
with open(os.path.join(output_dir, "data.txt"),"w",encoding="utf-8") as f: with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data: for i in token_label_data:
f.write(str(i)+"\n") f.write(str(i)+"\n")
# with open(os.path.join(output_dir, "dev.txt"),"w",encoding="utf-8") as f: # with open(os.path.join(output_dir, "dev.txt"),"w",encoding="utf-8") as f:
...@@ -201,27 +207,110 @@ def create_lang_8(data_dir, output_dir): ...@@ -201,27 +207,110 @@ def create_lang_8(data_dir, output_dir):
# for i in test_data: # for i in test_data:
# f.write(str(i)+"\n") # f.write(str(i)+"\n")
# class DatasetThread(threading.Thread): #继承父类threading.Thread def create_baike(data, output_dir, output_name):
# def __init__(self, threadID, name, counter):
# threading.Thread.__init__(self) if not os.path.exists(output_dir):
# self.threadID = threadID os.mkdir(output_dir)
# self.name = name
# self.counter = counter token_label_data = []
# def run(self): #把要执行的代码写到run函数里面 线程在创建后会直接运行run函数 for item in tqdm(data):
# print("Starting:",self.name) item = eval(item)
# create_csc("./csc", "./data") answer = item["answer"]
# print "Exiting " + self.name answer = answer.replace("\r","").replace("\n","")
sentence = process_text(answer)
token_label = create_repetitions(sentence)
if token_label == None:
continue
token_label_data.append(token_label)
with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
def split_list(lst, n):
"""
将列表 lst 切分成 n 份,如果不能整除则最后一份可能会稍短。
:param lst: 待切分的列表
:param n: 切分的份数
:return: 切分后的列表集合
"""
# 计算每一份的长度
k, m = divmod(len(lst), n)
return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
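For example (illustrative): when the length is not divisible by n, the first m chunks carry one extra element:

print(split_list(list(range(10)), 3))  # -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]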
def create_wiki(data_dir, output_dir, output_name):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
token_label_data = []
files_dir = glob(data_dir+"/*")
for file in tqdm(files_dir):
lines = open(file,"r",encoding="utf-8").readlines()
for line in lines:
line = eval(line)
text = line["text"]
docs = text.split("\n\n")
# print(docs)
sentences = []
for doc in docs:
# print(doc)
if len(doc) < 10:
continue
doc = remove_non_chinese_english_chars(doc)
doc = doc.replace("\n","").replace(" ","")
doc = doc.lstrip()
doc = doc.rstrip()
sentences.append(doc)
for sentence in sentences:
# print(sentence)
token_label = create_repetitions(sentence)
if token_label == None:
continue
token_label_data.append(token_label)
with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
class DatasetThread(threading.Thread): # subclass of threading.Thread
def __init__(self, threadID, name, data_dir, output_dir):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.output_dir = output_dir
self.data_dir = data_dir
def run(self): # run() holds the thread's work and is executed once the thread is started
print("Starting:",self.name)
# create_baike(self.data, self.output_dir, self.name)
create_wiki(self.data_dir, self.output_dir, self.name)
print("Exiting:",self.name)
if __name__ == "__main__": if __name__ == "__main__":
create_lang_8("./lang-8", "./pretrain_data") # data_dir = "./dataset/baike2018qa/baike_qa_train.json"
# create_csc("./csc", "./data") # data = open(os.path.join(data_dir,"NLPCC2018_GEC_TrainingData/data.train"),"r",encoding="utf-8").readlines()
# 创建两个线程 # data = pd.read_csv(data_dir)
# try:
# thread.start_new_thread( print_time, ("Thread-1", 2, ) )
# thread.start_new_thread( print_time, ("Thread-2", 4, ) )
# except:
# print "Error: unable to start thread"
# while 1: # data = open(data_dir,"r",encoding="utf-8").readlines()
# pass
# print(f"data size: {len(data)}")
wiki_dir = "./dataset/wiki_zh"
wiki_files = os.listdir(wiki_dir)
thread_num = len(wiki_files)
output_dir = "./pretrain_data"
# print(wiki_files[0])
# create_wiki(os.path.join(wiki_dir,wiki_files[0]), output_dir, wiki_files[0])
# dataset = split_list(data, thread_num)
output_dir = "./pretrain_data"
for i in range(thread_num):
thread = DatasetThread(i+1, wiki_files[i], os.path.join(wiki_dir,wiki_files[i]), output_dir)
thread.start()
# create_csc("./dataset/csc", "./dataset/finetune_data")
import os import os
import logging import logging
import torch import torch
from torch import nn
from transformers import AdamW, get_linear_schedule_with_warmup from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer from tokenizer import CTTokenizer
from config import CTConfig from config import CTConfig
from model import CTTransformerForPreTraining from model import CTTransformerForPreTraining
...@@ -16,6 +15,8 @@ import argparse ...@@ -16,6 +15,8 @@ import argparse
from tqdm import tqdm, trange from tqdm import tqdm, trange
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/finetune')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -33,49 +34,57 @@ class CTDataset(Dataset): ...@@ -33,49 +34,57 @@ class CTDataset(Dataset):
def __init__(self, config, data_dir, tokenizer, max_length): def __init__(self, config, data_dir, tokenizer, max_length):
self.data = open(data_dir,"r",encoding="utf-8").readlines() self.data = open(data_dir,"r",encoding="utf-8").readlines()
self.texts, self.labels = self.read_data(self.data) self.tokens, self.labels = self.read_data(self.data)
# self.label_list = label_list # self.label_list = label_list
# self.label2id = label2id self.label2id = label2id
self.id2label = id2label # self.id2label = id2label
# self.num_labels = num_labels # self.num_labels = num_labels
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.max_length = max_length self.max_length = max_length
self.pad_label_id = config.pad_label_id self.pad_token_id = config.pad_token_id # 0
self.pad_label_id = config.pad_label_id # -100
def read_data(self, data): def read_data(self, data):
texts, labels = [], [] tokens, labels = [], []
for sent in data: for sent in tqdm(data):
sent = eval(sent) sent = eval(sent)
text, label = [], [] text, label = [], []
for token_label in sent: for token_label in sent:
token, tag = token_label.split("|") token, tag = token_label.split("|")
text.append(token) text.append(token)
label.append(tag) label.append(tag)
texts.append(text) tokens.append(text)
labels.append(label) labels.append(label)
return texts, labels return tokens, labels
def __len__(self): def __len__(self):
return len(self.texts) return len(self.tokens)
def __getitem__(self, idx): def __getitem__(self, idx):
text = self.texts[idx] tokens = self.tokens[idx]
label = self.labels[idx] labels = self.labels[idx]
assert len(text) == len(label) assert len(tokens) == len(labels)
token_ids, attention_mask = self.tokenizer( tokens = tokens[:self.max_length - 2] if len(tokens) > self.max_length - 2 else tokens
text, labels = labels[:self.max_length - 2] if len(labels) > self.max_length - 2 else labels
# padding='max_length',
# truncation=True, label_ids = [ self.label2id[i] for i in labels]
max_length=self.max_length,
is_split_into_words=True, tokens = [self.tokenizer.sentence_start_token] + tokens + [self.tokenizer.sentence_end_token]
# return_tensors='pt' label_ids = [self.pad_label_id] + label_ids + [self.pad_label_id]
) attention_mask = [1] * len(tokens)
label_ids = [ label2id[i] for i in label]
padding_length = self.max_length - len(tokens)
tokens.extend([self.tokenizer.padding_token] * padding_length)
attention_mask.extend([0] * padding_length)
token_ids = self.tokenizer.tokens2ids(tokens)
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids))) label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
inputs = { inputs = {
...@@ -86,22 +95,6 @@ class CTDataset(Dataset): ...@@ -86,22 +95,6 @@ class CTDataset(Dataset):
return inputs return inputs
# def compute_metrics(predictions, labels):
# true_predictions = [
# [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# true_labels = [
# [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# precision=precision_score(true_labels, true_predictions, average='micro')
# recall=recall_score(true_labels, true_predictions, average='micro')
# f1=f1_score(true_labels, true_predictions, average='micro')
# return precision, recall, f1
def train(args, model, train_dataset, dev_dataset): def train(args, model, train_dataset, dev_dataset):
args.train_batch_size = args.per_gpu_train_batch_size args.train_batch_size = args.per_gpu_train_batch_size
...@@ -160,7 +153,7 @@ def train(args, model, train_dataset, dev_dataset): ...@@ -160,7 +153,7 @@ def train(args, model, train_dataset, dev_dataset):
outputs = model(token_ids, attention_mask, label_ids) outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0] loss = outputs[0]
writer.add_scalar('train_loss', loss, global_step)
if args.gradient_accumulation_steps > 1: if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps loss = loss / args.gradient_accumulation_steps
...@@ -176,12 +169,15 @@ def train(args, model, train_dataset, dev_dataset): ...@@ -176,12 +169,15 @@ def train(args, model, train_dataset, dev_dataset):
global_step += 1 global_step += 1
if args.logging_steps > 0 and global_step % args.logging_steps == 0: if args.logging_steps > 0 and global_step % args.logging_steps == 0:
if args.evaluate_during_training: if args.evaluate_during_training:
f1, _, _ = evaluate(args, model, dev_dataset) f1, eval_loss, _ = evaluate(args, model, dev_dataset)
writer.add_scalar('f1', f1, global_step)
writer.add_scalar('eval_loss', eval_loss, global_step)
if best_f1 < f1: if best_f1 < f1:
best_f1 = f1 best_f1 = f1
if not os.path.exists(args.output_dir): if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir) os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt" save_dir = os.path.join(args.output_dir, "best_checkpoint.pt")
torch.save(model.state_dict(), save_dir) torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir) logger.info("Saving best checkpoint to %s", save_dir)
...@@ -192,7 +188,7 @@ def train(args, model, train_dataset, dev_dataset): ...@@ -192,7 +188,7 @@ def train(args, model, train_dataset, dev_dataset):
best_f1 = f1 best_f1 = f1
if not os.path.exists(args.output_dir): if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir) os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt" save_dir = os.path.join(args.output_dir, "best_checkpoint.pt")
torch.save(model.state_dict(), save_dir) torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir) logger.info("Saving best checkpoint to %s", save_dir)
...@@ -280,11 +276,11 @@ def set_seed(args): ...@@ -280,11 +276,11 @@ def set_seed(args):
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", default="./data", type=str) parser.add_argument("--data_dir", default="./dataset/finetune_data", type=str)
parser.add_argument("--model_path", default="./outputs/model.pt", type=str) parser.add_argument("--model_path", default="./output/pretrain_output/model.pt", type=str)
parser.add_argument("--output_dir", default='./ft_outputs/', type=str) parser.add_argument("--output_dir", default='./output/finetune_output/', type=str)
parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--max_seq_length", default=256, type=int)
parser.add_argument("--do_train", action="store_true", parser.add_argument("--do_train", action="store_true",
help="Whether to run training.") help="Whether to run training.")
...@@ -295,9 +291,9 @@ def parse_args(): ...@@ -295,9 +291,9 @@ def parse_args():
help="Whether to run evaluation during training at each logging step.") help="Whether to run evaluation during training at each logging step.")
parser.add_argument("--evaluate_after_epoch", action="store_true", parser.add_argument("--evaluate_after_epoch", action="store_true",
help="Whether to run evaluation after each epoch.") help="Whether to run evaluation after each epoch.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, parser.add_argument("--per_gpu_train_batch_size", default=64, type=int,
help="Batch size per GPU/CPU for training.") help="Batch size per GPU/CPU for training.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
help="Batch size per GPU/CPU for evaluation.") help="Batch size per GPU/CPU for evaluation.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.") help="Number of updates steps to accumulate before performing a backward/update pass.")
...@@ -334,7 +330,7 @@ if __name__ == "__main__": ...@@ -334,7 +330,7 @@ if __name__ == "__main__":
if os.path.exists(args.output_dir): if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}") os.system(f"rm -rf {args.output_dir}")
os.system(f"rm -rf ./run/finetune")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.device = device args.device = device
...@@ -347,10 +343,11 @@ if __name__ == "__main__": ...@@ -347,10 +343,11 @@ if __name__ == "__main__":
config = CTConfig('config.json') config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json') tokenizer = CTTokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"] label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
# label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM"]
label2id = {j:i for i,j in enumerate(label_list)} label2id = {j:i for i,j in enumerate(label_list)}
id2label = {i:j for i,j in enumerate(label_list)} id2label = {i:j for i,j in enumerate(label_list)}
num_labels = len(label_list) num_labels = len(label_list)
...@@ -380,7 +377,7 @@ if __name__ == "__main__": ...@@ -380,7 +377,7 @@ if __name__ == "__main__":
if not os.path.exists(args.output_dir): if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir) os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/last_checkpoint.pt" save_dir = os.path.join(args.output_dir, "last_checkpoint.pt")
torch.save(model.state_dict(), save_dir) torch.save(model.state_dict(), save_dir)
logger.info("Saving last checkpoint to %s", save_dir) logger.info("Saving last checkpoint to %s", save_dir)
...@@ -396,5 +393,5 @@ if __name__ == "__main__": ...@@ -396,5 +393,5 @@ if __name__ == "__main__":
f1, _, overall_result = evaluate(args, model, test_dataset) f1, _, overall_result = evaluate(args, model, test_dataset)
output_eval_file = os.path.join(args.output_dir, "test_results.txt") output_eval_file = os.path.join(args.output_dir, "test_results.txt")
with open(output_eval_file, "a") as writer: with open(output_eval_file, "a") as writer:
writer.write('***** Predict in test dataset *****') writer.write('***** Predict in test dataset ***** \n')
writer.write("{} \n".format(overall_result)) writer.write("{} \n".format(overall_result))
#!/bin/bash
+export CUDA_VISIBLE_DEVICES=3
OUTPUT_DIR='./output/finetune_output'
-MODEL_PATH='./output/pretrain_output/model.pt'
+MODEL_PATH='./output/model.pt'
-CUDA_VISIBLE_DEVICES='2' python fintune.py \
+python finetune.py \
--do_train \
--do_eval \
--model_path $MODEL_PATH \
--output_dir $OUTPUT_DIR \
--evaluate_after_epoch \
---per_gpu_train_batch_size 42 \
+--per_gpu_train_batch_size 64 \
---per_gpu_eval_batch_size 42 \
+--per_gpu_eval_batch_size 64 \
--dropout_prob 0.1 \
--max_seq_length 256 \
--learning_rate 3e-5 \
--weight_decay 5e-5 \
--num_train_epochs 10 \
+--logging_steps 0.5 \
--seed 42
...@@ -11,14 +11,12 @@ class CTEmbeddings(nn.Module): ...@@ -11,14 +11,12 @@ class CTEmbeddings(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id) self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id)
# self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
self.register_buffer('position_embeddings', self._get_sinusoid_encoding_table(config.max_position_embeddings, config.d_model)) self.register_buffer('position_embeddings', self._get_sinusoid_encoding_table(config.max_position_embeddings, config.d_model))
self.LayerNorm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) self.LayerNorm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
def _get_sinusoid_encoding_table(self, n_position, d_hid): def _get_sinusoid_encoding_table(self, n_position, d_hid):
''' Sinusoid position encoding table ''' ''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position): def get_position_angle_vec(position):
# compute the angle term inside the sin/cos brackets for this position
...@@ -36,8 +34,6 @@ class CTEmbeddings(nn.Module): ...@@ -36,8 +34,6 @@ class CTEmbeddings(nn.Module):
input_ids: torch.Tensor, input_ids: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
word_embeddings = self.word_embeddings(input_ids) word_embeddings = self.word_embeddings(input_ids)
# position_embeddings = self.position_embeddings(position_ids)
# embeddings = word_embeddings + position_embeddings
embeddings = word_embeddings + self.position_embeddings[:, :word_embeddings.size(1)] embeddings = word_embeddings + self.position_embeddings[:, :word_embeddings.size(1)]
embeddings = self.LayerNorm(embeddings) embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings) embeddings = self.dropout(embeddings)
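The table built by _get_sinusoid_encoding_table is elided in this hunk; for reference, a torch-only sketch of the standard sinusoid encoding (PE[pos, 2i] = sin(pos / 10000^(2i/d)), PE[pos, 2i+1] = cos(pos / 10000^(2i/d))) follows. It is illustrative, not the project's implementation:

import torch

def sinusoid_table(n_position: int, d_hid: int) -> torch.Tensor:
    position = torch.arange(n_position, dtype=torch.float32).unsqueeze(1)   # (n_position, 1)
    dim = torch.arange(d_hid)
    angle = position / torch.pow(10000.0, (2 * (dim // 2)) / d_hid)         # (n_position, d_hid)
    table = torch.zeros(n_position, d_hid)
    table[:, 0::2] = torch.sin(angle[:, 0::2])   # even dimensions
    table[:, 1::2] = torch.cos(angle[:, 1::2])   # odd dimensions
    return table.unsqueeze(0)                    # (1, n_position, d_hid), matching how forward() slices the buffer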
...@@ -56,9 +52,21 @@ class CTSelfAttention(nn.Module): ...@@ -56,9 +52,21 @@ class CTSelfAttention(nn.Module):
self.value = nn.Linear(config.d_model, self.all_head_size) self.value = nn.Linear(config.d_model, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def generate_ct_mask(self, batch_size:Optional[int], seq_len:Optional[int]): def my_triu(self, x: torch.tensor, diagonal: int):
l = x.size(-1)
arange = torch.arange(l)
mask = arange.expand(l,l)
mask = mask-diagonal
arange = arange.unsqueeze(-1)
mask = torch.le(mask, arange)
return mask
def generate_ct_mask(self, batch_size: int, seq_len: int):
mask = torch.ones(seq_len, seq_len, dtype=torch.bool) mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
mask = torch.triu(mask, diagonal=self.max_future_length) # mask = torch.triu(mask, diagonal=self.max_future_length)
mask = self.my_triu(mask, diagonal=self.max_future_length)
mask = torch.stack([mask] * batch_size) mask = torch.stack([mask] * batch_size)
mask = mask.reshape(batch_size,1,seq_len,seq_len) mask = mask.reshape(batch_size,1,seq_len,seq_len)
return mask return mask
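A quick illustrative check of the band this mask allows: on an all-ones boolean input, my_triu(x, d) keeps positions j <= i + d, the same band as torch.tril(x, diagonal=d), so each query can attend to all past positions plus at most max_future_length future positions:

import torch

def band_mask(seq_len: int, max_future: int) -> torch.Tensor:
    # Same result as my_triu on an all-ones bool matrix: True where j <= i + max_future.
    j = torch.arange(seq_len).expand(seq_len, seq_len)
    i = torch.arange(seq_len).unsqueeze(-1)
    return torch.le(j - max_future, i)

print(band_mask(4, 1).int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])
assert torch.equal(band_mask(4, 1), torch.tril(torch.ones(4, 4, dtype=torch.bool), diagonal=1))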
...@@ -69,7 +77,7 @@ class CTSelfAttention(nn.Module): ...@@ -69,7 +77,7 @@ class CTSelfAttention(nn.Module):
return x.permute(0, 2, 1, 3) return x.permute(0, 2, 1, 3)
def expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): def expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
""" """
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
""" """
...@@ -95,17 +103,20 @@ class CTSelfAttention(nn.Module): ...@@ -95,17 +103,20 @@ class CTSelfAttention(nn.Module):
key_layer = self.transpose_for_scores(self.key(hidden_states)) key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states))
attention = torch.matmul(query_layer, key_layer.transpose(-1, -2)) attention = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention = attention.masked_fill(attention_mask == 0, float("-1e20"))
ct_mask = ct_mask.to(attention_mask.device) ct_mask = ct_mask.to(attention_mask.device)
attention = attention.masked_fill(ct_mask == 0, float("-1e20")) attention = attention.masked_fill(ct_mask == 0, float("-1e20"))
attention = attention.masked_fill(attention_mask == 0, float("-1e20"))
attention = attention / math.sqrt(self.attention_head_size) attention = attention / math.sqrt(self.attention_head_size)
attention_probs = F.softmax(attention, dim=-1) attention_probs = F.softmax(attention, dim=-1)
attention_probs = self.dropout(attention_probs) attention_probs = self.dropout(attention_probs)
# context_layer is the product of the attention matrix and the value matrix; its original shape is (batch_size, num_attention_heads, sequence_length, attention_head_size);
context_layer = torch.matmul(attention_probs, value_layer) context_layer = torch.matmul(attention_probs, value_layer)
# after the permute and view, context_layer is back to shape (batch_size, sequence_length, hidden_size).
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape) context_layer = context_layer.view(new_context_layer_shape)
...@@ -132,6 +143,7 @@ class CTLayer(nn.Module): ...@@ -132,6 +143,7 @@ class CTLayer(nn.Module):
# feed forward block # feed forward block
def _ff_block(self, x: torch.Tensor) -> torch.Tensor: def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
# FFN(x) = max(0, x*W_1+ b_1)*W_2 + b_2
x = self.linear2(self.dropout(self.activation(self.linear1(x)))) x = self.linear2(self.dropout(self.activation(self.linear1(x))))
return self.dropout2(x) return self.dropout2(x)
...@@ -150,8 +162,10 @@ class CTLayer(nn.Module): ...@@ -150,8 +162,10 @@ class CTLayer(nn.Module):
x = hidden_states x = hidden_states
# residual connection
x = self.norm1(x + self.dropout1(attention_scores)) x = self.norm1(x + self.dropout1(attention_scores))
# FeedForward
x = self.norm2(x + self._ff_block(x)) x = self.norm2(x + self._ff_block(x))
return x return x
...@@ -198,18 +212,11 @@ class CTTransformer(nn.Module): ...@@ -198,18 +212,11 @@ class CTTransformer(nn.Module):
attention_mask: torch.Tensor, attention_mask: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
# batch_size, seq_len = input_ids.size() embedding_output = self.embeddings(input_ids)
# position_ids = self.generate_position_ids(batch_size, seq_len)
# device = input_ids.device
embedding_output = self.embeddings(
input_ids=input_ids,
# position_ids=position_ids,
)
encoder_outputs = self.encoder( encoder_outputs = self.encoder(
embedding_output, embedding_output,
attention_mask=attention_mask, attention_mask,
) )
last_hidden_state = encoder_outputs[0] last_hidden_state = encoder_outputs[0]
...@@ -233,7 +240,7 @@ class CTTransformerForPreTraining(nn.Module): ...@@ -233,7 +240,7 @@ class CTTransformerForPreTraining(nn.Module):
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
attention_mask: torch.Tensor, attention_mask: torch.Tensor,
labels: torch.Tensor, labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor]: ) -> Tuple[torch.Tensor]:
last_hidden_state = self.ct_tranformer(input_ids, attention_mask) last_hidden_state = self.ct_tranformer(input_ids, attention_mask)
......
import os import os
import logging import logging
import torch import torch
from torch import nn
from torch.utils.data.distributed import DistributedSampler from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from torch.optim import Adam from torch.optim import Adam
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer from tokenizer import CTTokenizer
from config import CTConfig from config import CTConfig
from model import CTTransformerForPreTraining from model import CTTransformerForPreTraining
from transformers import get_linear_schedule_with_warmup
import numpy as np import numpy as np
import random import random
import argparse import argparse
from glob import glob
from tqdm import tqdm, trange from tqdm import tqdm, trange
writer = SummaryWriter('runs/pretrain')
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -32,8 +35,7 @@ def collate_fn(batch): ...@@ -32,8 +35,7 @@ def collate_fn(batch):
class CTDataset(Dataset): class CTDataset(Dataset):
def __init__(self, config, tokenizer, max_length): def __init__(self, config, tokenizer, max_length):
self.data = open(config.train_data,"r",encoding="utf-8").readlines() self.tokens, self.labels = self.read_data(config.data_dir)
self.texts, self.labels = self.read_data(self.data)
self.label_list = label_list self.label_list = label_list
self.label2id = label2id self.label2id = label2id
...@@ -44,38 +46,52 @@ class CTDataset(Dataset): ...@@ -44,38 +46,52 @@ class CTDataset(Dataset):
self.max_length = max_length self.max_length = max_length
self.pad_label_id = config.pad_label_id self.pad_label_id = config.pad_label_id
def read_data(self, data): def read_data(self, data_dir):
data_files = glob(os.path.join(data_dir,"*.txt"))
texts, labels = [], [] texts, labels = [], []
for sent in data: for file in data_files:
sent = eval(sent) lines = open(file,"r",encoding="utf-8").readlines()
text, label = [], [] logger.info(f"Loading {file} data!")
for token_label in sent: for line in tqdm(lines):
token, tag = token_label.split("|") sent = eval(line)
text.append(token) text, label = [], []
label.append(tag) for token_label in sent:
texts.append(text) if token_label.count("|") > 1:
labels.append(label) continue
token, tag = token_label.split("|")
text.append(token)
label.append(tag)
texts.append(text)
labels.append(label)
assert len(texts) == len(labels)
return texts, labels return texts, labels
def __len__(self): def __len__(self):
return len(self.texts) return len(self.tokens)
def __getitem__(self, idx): def __getitem__(self, idx):
text = self.texts[idx] tokens = self.tokens[idx]
label = self.labels[idx] labels = self.labels[idx]
assert len(text) == len(label)
tokens = tokens[:self.max_length - 2] if len(tokens) > self.max_length - 2 else tokens
token_ids, attention_mask = self.tokenizer( labels = labels[:self.max_length - 2] if len(labels) > self.max_length - 2 else labels
text,
# padding='max_length', label_ids = [ self.label2id[i] for i in labels]
# truncation=True,
max_length=self.max_length, tokens = [self.tokenizer.sentence_start_token] + tokens + [self.tokenizer.sentence_end_token]
is_split_into_words=True, label_ids = [self.pad_label_id] + label_ids + [self.pad_label_id]
# return_tensors='pt' attention_mask = [1] * len(tokens)
)
label_ids = [ label2id[i] for i in label] padding_length = self.max_length - len(tokens)
tokens.extend([self.tokenizer.padding_token] * padding_length)
attention_mask.extend([0] * padding_length)
token_ids = self.tokenizer.tokens2ids(tokens)
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids))) label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
inputs = { inputs = {
...@@ -103,9 +119,18 @@ def train(args, model, train_dataset): ...@@ -103,9 +119,18 @@ def train(args, model, train_dataset):
# Initialize the optimizer and learning-rate scheduler
optimizer = Adam(model.parameters(), lr=args.lr) optimizer = Adam(model.parameters(), lr=args.lr)
# total_steps = len(train_dataloader) // args.epochs # t_total = len(train_dataloader) // args.epochs
# scheduler = get_linear_schedule_with_warmup(
# optimizer, num_warmup_steps=0, num_training_steps=t_total
# )
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_batch_size)
# logger.info(" Total optimization steps = %d", t_total)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
training_loss = 0 training_loss = 0
global_step = 0 global_step = 0
...@@ -124,6 +149,8 @@ def train(args, model, train_dataset): ...@@ -124,6 +149,8 @@ def train(args, model, train_dataset):
outputs = model(token_ids, attention_mask, label_ids) outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0] loss = outputs[0]
writer.add_scalar('Loss', loss, epoch)
epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch+1, round(loss.item(), 6))) epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch+1, round(loss.item(), 6)))
if args.n_gpu > 1: if args.n_gpu > 1:
...@@ -132,9 +159,8 @@ def train(args, model, train_dataset): ...@@ -132,9 +159,8 @@ def train(args, model, train_dataset):
loss.backward() loss.backward()
training_loss += loss.item() training_loss += loss.item()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# scheduler.step()
optimizer.step() optimizer.step()
# scheduler.step()
model.zero_grad() model.zero_grad()
global_step += 1 global_step += 1
...@@ -180,8 +206,8 @@ def parse_args(): ...@@ -180,8 +206,8 @@ def parse_args():
parser.add_argument("--epochs", default=3, type=int) parser.add_argument("--epochs", default=3, type=int)
parser.add_argument("--lr", default=5e-5, type=float) parser.add_argument("--lr", default=5e-5, type=float)
parser.add_argument("--per_gpu_batch_size", default=8, type=int) parser.add_argument("--per_gpu_batch_size", default=600, type=int)
parser.add_argument("--max_length", default=128, type=int) parser.add_argument("--max_length", default=256, type=int)
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -194,6 +220,7 @@ if __name__ == "__main__": ...@@ -194,6 +220,7 @@ if __name__ == "__main__":
if os.path.exists(args.output_dir): if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}") os.system(f"rm -rf {args.output_dir}")
os.system("rm -rf ./run/pretrain")
if args.local_rank == -1 or args.no_cuda: if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
...@@ -216,7 +243,7 @@ if __name__ == "__main__": ...@@ -216,7 +243,7 @@ if __name__ == "__main__":
config = CTConfig('config.json') config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json') tokenizer = CTTokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"] label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
...@@ -232,8 +259,13 @@ if __name__ == "__main__": ...@@ -232,8 +259,13 @@ if __name__ == "__main__":
model = CTTransformerForPreTraining(config) model = CTTransformerForPreTraining(config)
model.to(args.device) model.to(args.device)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f'Total number of parameters: {num_params}')
config.train_data = os.path.join(args.data_dir,'data.txt') config.train_data = os.path.join(args.data_dir,'data.txt')
# config.data = open(args.data_dir,"r",encoding='utf-8').readlines()
config.data_dir = args.data_dir
pretrain_dataset = CTDataset(config, tokenizer, args.max_length) pretrain_dataset = CTDataset(config, tokenizer, args.max_length)
train(args, model, pretrain_dataset) train(args, model, pretrain_dataset)
writer.close()
CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \ export CUDA_VISIBLE_DEVICES="4,5,6,7"
--epochs 20 \
--per_gpu_batch_size 64 \ python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--lr 5e-5 \ --data_dir ./pretrain_data \
--output_dir ./outputs \ --epochs 10 \
--per_gpu_batch_size 500 \
--lr 1e-4 \
--output_dir ./output/pretrain_output \
--max_length 256 \
--seed 2024 --seed 2024
\ No newline at end of file
import json
import os
import glob
import time
import torch
from tqdm import tqdm
from seqeval.metrics import accuracy_score, f1_score, classification_report
from config import CTConfig
from tokenizer import CTTokenizer
from model import CTTransformerForPreTraining
config = CTConfig("./config.json")
tokenizer = CTTokenizer("./vocab.json")
model_path = "./output/finetune_output/best_checkpoint.pt"
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
num_labels = len(label_list)
config.label_list = label_list
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
state_dict = torch.load(model_path, weights_only=True)
model.load_state_dict(state_dict)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
model.eval()
def predict(text):
start_time = time.perf_counter()
max_length=256
# text = "啊嗯对哦,有蓝色有蓝的那个哦,有logo是吧?Logo对花花式咖啡啡"
tokens = tokenizer.tokenize(text)
tokens = tokens[:max_length - 2] if len(tokens) > max_length - 2 else tokens
inputs = tokenizer(text, max_length=max_length, return_pt=True)
output = model(**inputs)
logits = output[1]
preds = logits.argmax(-1).view(-1).tolist()
preds = preds[1:len(tokens)+1]
# print(preds)
result = []
# print(len(tokens))
# print(len(preds))
assert len(tokens) == len(preds)
for token,pred in zip(tokens, preds):
if pred in [0,5,6]:
result.append(token)
# print(''.join(result))
end_time = time.perf_counter()
# compute the elapsed time in seconds and convert it to milliseconds
elapsed_time_ms = (end_time - start_time) * 1000
result = f"文本:{text}, 结果:{''.join(result)}, 运行时间: {elapsed_time_ms:.2f} 毫秒"
print(result)
return result
if __name__ == '__main__':
data = open("1726035525.5445561.test.txt","r",encoding='utf-8').readlines()
fw = open("ct_result.txt","w",encoding='utf-8')
for d in data:
d = d.rstrip()
result = predict(d)
fw.write(result+"\n")
fw.close()
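For readability, a small sketch of what the id filter in predict() keeps, assuming the usual disfluency reading of the tags (IM for fillers, RM for reparanda, RP for repairs); the helper name below is illustrative:

id2label = {i: tag for i, tag in enumerate(label_list)}  # label_list as defined at the top of the script

def keep_token(pred_id: int) -> bool:
    # ids 0, 5 and 6 correspond to "O", "B-RP" and "I-RP"; IM/RM spans are dropped from the output
    return id2label[pred_id] in ("O", "B-RP", "I-RP")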
import json import json
import re import torch
# import string
# zh_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." class CTTokenizer:
# en_punct = string.punctuation
# punct_list = zh_punct + en_punct
class Tokenizer:
def __init__(self, vocab_path): def __init__(self, vocab_path):
self.vocab = self.load_vocab(vocab_path) self.vocab = self.load_vocab(vocab_path)
self.vocab_set = set(self.vocab) # use a set for fast membership lookups
self.vocab_dict = {token: idx for idx, token in enumerate(self.vocab)} # token -> id mapping
self.sentence_start_token = '<s>' self.sentence_start_token = '<s>'
self.sentence_end_token = '</s>' self.sentence_end_token = '</s>'
self.padding_token = '<pad>' self.padding_token = '<pad>'
self.unk_token = '<unk>'
def load_vocab(self, vocab_path): def load_vocab(self, vocab_path):
with open(vocab_path, 'r', encoding='utf-8') as f: with open(vocab_path, 'r', encoding='utf-8') as f:
vocab = json.load(f) vocab = json.load(f)
return vocab return vocab
def ids2tokens(self, token_ids):
def ids2tokens(self, token_ids):
return [self.vocab[token_id] for token_id in token_ids] return [self.vocab[token_id] for token_id in token_ids]
def tokens2ids(self, tokens): def tokens2ids(self, tokens):
return [self.vocab.index(token) for token in tokens] return [self.vocab_dict.get(token, self.vocab_dict[self.unk_token]) for token in tokens]
def tokenize(self, text): def tokenize(self, text):
text = text.lower() text = text.lower()
# text = [char for char in text if char not in punct_list]
# text = ''.join(text)
# match single Chinese characters, English words and other characters, including whitespace
pattern = r'[\u4e00-\u9fa5]|[a-zA-Z]+|\s|[^a-zA-Z\s]'
tokens = re.findall(pattern, text)
# handle English words, Chinese characters and whitespace
tokenize_result = [] tokenize_result = []
for token in tokens:
if token in self.vocab: # 遍历文本中的字符
if token.isalpha() and token.isascii(): current_token = []
tokenize_result.append(token) for char in text:
elif token in (chr(i) for i in range(0x4E00, 0x9FA5 + 1)): if char.isalpha() and char.isascii():
tokenize_result.append(token) current_token.append(char) # 英文字符
else: elif '\u4e00' <= char <= '\u9fa5':
tokenize_result.append(token) # 处理其他符号 if current_token:
tokenize_result.append(''.join(current_token))
current_token = []
tokenize_result.append(char) # Chinese character
elif char.isdigit(): # digit
current_token.append(char)
else: else:
if not token.isspace(): if current_token:
tokenize_result.append('<unk>') tokenize_result.append(''.join(current_token))
current_token = []
if not char.isspace():
tokenize_result.append(char) # other symbol
return tokenize_result if current_token: # flush the last buffered token
tokenize_result.append(''.join(current_token))
def __call__(self, text, max_length=None, is_split_into_words=None): # 处理 token 结果
return [token if token in self.vocab_set else self.unk_token for token in tokenize_result]
def __call__(self, text, max_length=None, is_split_into_words=None, return_pt = None):
if is_split_into_words: if is_split_into_words:
tokens = text tokens = text
else: else:
tokens = self.tokenize(text) tokens = self.tokenize(text)
if len(tokens) > max_length - 2:
tokens = tokens[:max_length-2] # truncate
tokens = tokens[:max_length - 2] if max_length and len(tokens) > max_length - 2 else tokens
tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token] tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token]
attention_mask = [1] * len(tokens)
# padding attention_mask = [1] * len(tokens) # build the attention mask
padding_length = max_length - len(tokens) padding_length = (max_length - len(tokens)) if max_length else 0
tokens.extend([self.padding_token] * padding_length) tokens.extend([self.padding_token] * padding_length)
attention_mask.extend([0] * padding_length)
attention_mask.extend([0]*padding_length)
token_ids = self.tokens2ids(tokens) token_ids = self.tokens2ids(tokens)
assert len(token_ids) == len(attention_mask) assert len(token_ids) == len(attention_mask)
if return_pt:
token_ids, attention_mask = torch.tensor(token_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0)
inputs = {
"input_ids": token_ids,
"attention_mask": attention_mask
}
return inputs
return token_ids, attention_mask return token_ids, attention_mask
# if __name__ == '__main__': # if __name__ == '__main__':
# tokenizer = Tokenizer('vocab.json') # tokenizer = Tokenizer('vocab.json')
# print(len(tokenizer.vocab)) # print(len(tokenizer.vocab))
# text = "Hello, 你好!This is a test." # text = "Hello, 你好!This 2025 40 is a test."
# token_ids, attention_mask = tokenizer.encode(text, 32) # token_ids, attention_mask = tokenizer(text, 32)
# print(token_ids)
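A short usage sketch of the updated CTTokenizer (illustrative; actual ids depend on vocab.json):
# tok = CTTokenizer("vocab.json")
# tok.tokenize("你好 hello 123!")                               # -> ['你', '好', 'hello', '123', '!'] (OOV pieces become '<unk>')
# ids, mask = tok("你好 hello 123!", max_length=16)              # lists padded to max_length with '<pad>' ids / 0s
# inputs = tok("你好 hello 123!", max_length=16, return_pt=True) # dict of 1 x 16 tensors, ready for the model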