Commit 853d9bea by tongtao.ling

update model.py

parent 29a4e564
......@@ -2,3 +2,6 @@ __pycache__/
dataset/
output/
pretrain_data/
runs/
test_data
*.ipynb
\ No newline at end of file
......@@ -2,7 +2,7 @@
"vocab_size":272833,
"d_model": 256,
"num_attention_heads":8,
"max_position_embeddings":128,
"max_position_embeddings":256,
"hidden_dropout_prob":0.1,
"attention_probs_dropout_prob":0.1,
"layer_norm_eps":1e-12,
......
import onnx
import os
import torch
import random
import onnxruntime
import numpy as np
from pprint import pprint
from config import CTConfig
from tokenizer import CTTokenizer
from model import CTTransformerForPreTraining
from onnxruntime.quantization import quantize_dynamic, QuantType
config = CTConfig("./config.json")
tokenizer = CTTokenizer("./vocab.json")
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
num_labels = len(label_list)
config.label_list = label_list
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
model_path = "./output/finetune_output/best_checkpoint.pt"
state_dict = torch.load(model_path, weights_only=True, map_location="cuda:0")
model.load_state_dict(state_dict)
model.eval()
def generate_inputs(batch_size, max_len):
input_ids = []
for _ in range(batch_size):
seq_length = random.randint(5,10)
token_ids = np.random.randint(1,config.vocab_size,(seq_length,))
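# ids 1 and 2 are assumed here to be the <s> and </s> special tokens; 0 is the pad id (config.pad_token_id)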
token_ids = [1] + token_ids.tolist() + [2]
token_ids = np.array(token_ids)
token_ids = np.append(token_ids,np.zeros(max_len-len(token_ids)),axis=0)
input_ids.append(torch.tensor(token_ids).long())
input_ids = torch.stack(input_ids)
attention_mask = (input_ids != 0).long()
print(input_ids)
print(attention_mask)
return input_ids, attention_mask
def export_onnx(output_dir):
# Export the model to ONNX format
batch_size = 1  # batch size
sequence_length = 256
dynamic_axes= {"input_ids":{0:"batch_size",1:"sequence_length"},
"attention_mask":{0:"batch_size",1:"sequence_length"},
"logits":{0:"batch_size",1:"sequence_length"},
}
token_ids, attention_mask = generate_inputs(batch_size, sequence_length)
dummy_input = {
"input_ids": token_ids,
"attention_mask":attention_mask
}
torch.onnx.export(model, dummy_input, os.path.join(output_dir,'model.onnx'),
export_params=True,
training=torch.onnx.TrainingMode.EVAL,
# opset_version=15,
do_constant_folding=True,
input_names=['input_ids','attention_mask'],
output_names=['logits'],
dynamic_axes=dynamic_axes,
verbose=True)
def quantize(model_fp32, model_quant):
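# Dynamic quantization stores the weights as int8 and quantizes activations on the fly at
# inference time, which shrinks the model file and usually speeds up CPU inference.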
quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QInt8)
def printinfo(onnx_session):
print("----------------- 输入部分 -----------------")
input_tensors = onnx_session.get_inputs() # 该 API 会返回列表
for input_tensor in input_tensors: # 因为可能有多个输入,所以为列表
input_info = {
"name" : input_tensor.name,
"type" : input_tensor.type,
"shape": input_tensor.shape,
}
pprint(input_info)
print("----------------- 输出部分 -----------------")
output_tensors = onnx_session.get_outputs() # 该 API 会返回列表
for output_tensor in output_tensors: # 因为可能有多个输出,所以为列表
output_info = {
"name" : output_tensor.name,
"type" : output_tensor.type,
"shape": output_tensor.shape,
}
pprint(output_info)
if __name__ == "__main__":
output_dir = "./output/finetune_output"
export_onnx(output_dir)
model = onnx.load(os.path.join(output_dir,"model.onnx"))
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))
ort_session = onnxruntime.InferenceSession(os.path.join(output_dir,"model.onnx"))
printinfo(ort_session)
batch_size = 1  # batch size
sequence_length = 256
token_ids, attention_mask = generate_inputs(batch_size, sequence_length)
outputs = ort_session.run(None, {
"input_ids": token_ids.numpy(),
"attention_mask":attention_mask.numpy(),
})
print(outputs[0].shape)
model_fp32 = os.path.join(output_dir,"model.onnx")
model_quant = os.path.join(output_dir,"model_quant.onnx")
quantize(model_fp32, model_quant)
\ No newline at end of file
......@@ -8,19 +8,14 @@ import pandas as pd
import string
import threading
from tqdm import tqdm
from tokenizer import Tokenizer
from glob import glob
from tokenizer import CTTokenizer
chinese_punctuation = ",。!?;:()【】《》“”‘’、"
punctuation_list = string.punctuation + chinese_punctuation
interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯']
# interregnum_list = [token for token, pos in dt.word_tag_tab.items() if pos == "y"]
# interregnum_list.extend(["啊","哦","哎","恩","嗯"])
# interregnum_list.remove("吧")
# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = Tokenizer("vocab.json")
tokenizer = CTTokenizer("vocab.json")
def process_text(sent):
sent = sent.replace("\n","").replace("\\n","").replace("|","")
......@@ -136,9 +131,8 @@ def create_csc(data_dir, output_dir):
for split in ["train","dev","test"]:
# for split in ["test"]:
data = json.load(open(os.path.join(data_dir, f"{split}.json"),"r",encoding="utf-8"))
data = data[:100]
token_label_data = []
for item in data:
for item in tqdm(data):
# sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的,"
sentence = item["correct_text"]
# sentence = process_text(sentence)
......@@ -158,12 +152,24 @@ def create_csc(data_dir, output_dir):
for i in token_label_data:
f.write(str(i)+"\n")
def create_lang_8(data_dir, output_dir):
def remove_non_chinese_english_chars(text):
# Character ranges to keep
chinese_characters = r'\u4e00-\u9fff'  # basic CJK ideographs
english_characters = r'a-zA-Z'
punctuation = re.escape('''!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~,。!?、;:“”‘’《》〈〉【】〔〕()‥·°〃※》々◦〝〞〃〄〆''')
# regular expression pattern
pattern = f'[^{chinese_characters}{english_characters}{punctuation}]+'
# strip every character that is not in the kept ranges
cleaned_text = re.sub(pattern, '', text)
return cleaned_text
def create_lang_8(data, output_dir, output_name):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
data = open(os.path.join(data_dir,"NLPCC2018_GEC_TrainingData/data.train"),"r",encoding="utf-8").readlines()
print(f"data size: {len(data)}")
token_label_data = []
for item in tqdm(data):
sentence = item.split("\t")[-1]
......@@ -180,7 +186,7 @@ def create_lang_8(data_dir, output_dir):
# train_data = token_label_data[:split_index]
# dev_data = token_label_data[split_index:]
with open(os.path.join(output_dir, "data.txt"),"w",encoding="utf-8") as f:
with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
# with open(os.path.join(output_dir, "dev.txt"),"w",encoding="utf-8") as f:
......@@ -201,27 +207,110 @@ def create_lang_8(data_dir, output_dir):
# for i in test_data:
# f.write(str(i)+"\n")
# class DatasetThread(threading.Thread):  # inherits from threading.Thread
# def __init__(self, threadID, name, counter):
# threading.Thread.__init__(self)
# self.threadID = threadID
# self.name = name
# self.counter = counter
# def run(self):  # the work goes in run(); it is executed once the thread is started
# print("Starting:",self.name)
# create_csc("./csc", "./data")
# print "Exiting " + self.name
def create_baike(data, output_dir, output_name):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
token_label_data = []
for item in tqdm(data):
item = eval(item)
answer = item["answer"]
answer = answer.replace("\r","").replace("\n","")
sentence = process_text(answer)
token_label = create_repetitions(sentence)
if token_label == None:
continue
token_label_data.append(token_label)
with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
def split_list(lst, n):
"""
将列表 lst 切分成 n 份,如果不能整除则最后一份可能会稍短。
:param lst: 待切分的列表
:param n: 切分的份数
:return: 切分后的列表集合
"""
# 计算每一份的长度
k, m = divmod(len(lst), n)
return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
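# e.g. split_list([1, 2, 3, 4, 5], 2) -> [[1, 2, 3], [4, 5]]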
def create_wiki(data_dir, output_dir, output_name):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
token_label_data = []
files_dir = glob(data_dir+"/*")
for file in tqdm(files_dir):
lines = open(file,"r",encoding="utf-8").readlines()
for line in lines:
line = eval(line)
text = line["text"]
docs = text.split("\n\n")
# print(docs)
sentences = []
for doc in docs:
# print(doc)
if len(doc) < 10:
continue
doc = remove_non_chinese_english_chars(doc)
doc = doc.replace("\n","").replace(" ","")
doc = doc.lstrip()
doc = doc.rstrip()
sentences.append(doc)
for sentence in sentences:
# print(sentence)
token_label = create_repetitions(sentence)
if token_label == None:
continue
token_label_data.append(token_label)
with open(os.path.join(output_dir, f"{output_name}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
class DatasetThread(threading.Thread):  # inherits from threading.Thread
def __init__(self, threadID, name, data_dir, output_dir):
threading.Thread.__init__(self)
self.threadID = threadID
self.name = name
self.output_dir = output_dir
self.data_dir = data_dir
def run(self):  # the work goes in run(); it is executed once the thread is started
print("Starting:",self.name)
# create_baike(self.data, self.output_dir, self.name)
create_wiki(self.data_dir, output_dir, self.name)
print("Exiting:",self.name)
if __name__ == "__main__":
create_lang_8("./lang-8", "./pretrain_data")
# create_csc("./csc", "./data")
# create two threads
# try:
# thread.start_new_thread( print_time, ("Thread-1", 2, ) )
# thread.start_new_thread( print_time, ("Thread-2", 4, ) )
# except:
# print "Error: unable to start thread"
# while 1:
# pass
# data_dir = "./dataset/baike2018qa/baike_qa_train.json"
# data = open(os.path.join(data_dir,"NLPCC2018_GEC_TrainingData/data.train"),"r",encoding="utf-8").readlines()
# data = pd.read_csv(data_dir)
# data = open(data_dir,"r",encoding="utf-8").readlines()
# print(f"data size: {len(data)}")
wiki_dir = "./dataset/wiki_zh"
wiki_files = os.listdir(wiki_dir)
thread_num = len(wiki_files)
output_dir = "./pretrain_data"
# print(wiki_files[0])
# create_wiki(os.path.join(wiki_dir,wiki_files[0]), output_dir, wiki_files[0])
# dataset = split_list(data, thread_num)
output_dir = "./pretrain_data"
for i in range(thread_num):
thread = DatasetThread(i+1, wiki_files[i], os.path.join(wiki_dir,wiki_files[i]), output_dir)
thread.start()
# create_csc("./dataset/csc", "./dataset/finetune_data")
import os
import logging
import torch
from torch import nn
from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from tokenizer import CTTokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
......@@ -16,6 +15,8 @@ import argparse
from tqdm import tqdm, trange
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/finetune')
logger = logging.getLogger(__name__)
......@@ -33,47 +34,55 @@ class CTDataset(Dataset):
def __init__(self, config, data_dir, tokenizer, max_length):
self.data = open(data_dir,"r",encoding="utf-8").readlines()
self.texts, self.labels = self.read_data(self.data)
self.tokens, self.labels = self.read_data(self.data)
# self.label_list = label_list
# self.label2id = label2id
self.id2label = id2label
self.label2id = label2id
# self.id2label = id2label
# self.num_labels = num_labels
self.tokenizer = tokenizer
self.max_length = max_length
self.pad_label_id = config.pad_label_id
self.pad_token_id = config.pad_token_id # 0
self.pad_label_id = config.pad_label_id # -100
def read_data(self, data):
texts, labels = [], []
for sent in data:
tokens, labels = [], []
for sent in tqdm(data):
sent = eval(sent)
text, label = [], []
for token_label in sent:
token, tag = token_label.split("|")
text.append(token)
label.append(tag)
texts.append(text)
tokens.append(text)
labels.append(label)
return texts, labels
return tokens, labels
def __len__(self):
return len(self.texts)
return len(self.tokens)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
assert len(text) == len(label)
token_ids, attention_mask = self.tokenizer(
text,
# padding='max_length',
# truncation=True,
max_length=self.max_length,
is_split_into_words=True,
# return_tensors='pt'
)
label_ids = [ label2id[i] for i in label]
tokens = self.tokens[idx]
labels = self.labels[idx]
assert len(tokens) == len(labels)
tokens = tokens[:self.max_length - 2] if len(tokens) > self.max_length - 2 else tokens
labels = labels[:self.max_length - 2] if len(labels) > self.max_length - 2 else labels
label_ids = [ self.label2id[i] for i in labels]
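# wrap with <s>/</s>; their label positions get pad_label_id (-100), presumably so the loss skips them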
tokens = [self.tokenizer.sentence_start_token] + tokens + [self.tokenizer.sentence_end_token]
label_ids = [self.pad_label_id] + label_ids + [self.pad_label_id]
attention_mask = [1] * len(tokens)
padding_length = self.max_length - len(tokens)
tokens.extend([self.tokenizer.padding_token] * padding_length)
attention_mask.extend([0] * padding_length)
token_ids = self.tokenizer.tokens2ids(tokens)
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
......@@ -86,22 +95,6 @@ class CTDataset(Dataset):
return inputs
# def compute_metrics(predictions, labels):
# true_predictions = [
# [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# true_labels = [
# [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# precision=precision_score(true_labels, true_predictions, average='micro')
# recall=recall_score(true_labels, true_predictions, average='micro')
# f1=f1_score(true_labels, true_predictions, average='micro')
# return precision, recall, f1
def train(args, model, train_dataset, dev_dataset):
args.train_batch_size = args.per_gpu_train_batch_size
......@@ -160,7 +153,7 @@ def train(args, model, train_dataset, dev_dataset):
outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0]
writer.add_scalar('train_loss', loss, global_step)
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
......@@ -176,12 +169,15 @@ def train(args, model, train_dataset, dev_dataset):
global_step += 1
if args.logging_steps > 0 and global_step % args.logging_steps == 0:
if args.evaluate_during_training:
f1, _, _ = evaluate(args, model, dev_dataset)
f1, eval_loss, _ = evaluate(args, model, dev_dataset)
writer.add_scalar('f1', f1, global_step)
writer.add_scalar('eval_loss', eval_loss, global_step)
if best_f1 < f1:
best_f1 = f1
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt"
save_dir = os.path.join(args.output_dir, "best_checkpoint.pt")
torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir)
......@@ -192,7 +188,7 @@ def train(args, model, train_dataset, dev_dataset):
best_f1 = f1
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt"
save_dir = os.path.join(args.output_dir, "best_checkpoint.pt")
torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir)
......@@ -280,11 +276,11 @@ def set_seed(args):
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", default="./data", type=str)
parser.add_argument("--model_path", default="./outputs/model.pt", type=str)
parser.add_argument("--data_dir", default="./dataset/finetune_data", type=str)
parser.add_argument("--model_path", default="./output/pretrain_output/model.pt", type=str)
parser.add_argument("--output_dir", default='./ft_outputs/', type=str)
parser.add_argument("--max_seq_length", default=128, type=int)
parser.add_argument("--output_dir", default='./output/finetune_output/', type=str)
parser.add_argument("--max_seq_length", default=256, type=int)
parser.add_argument("--do_train", action="store_true",
help="Whether to run training.")
......@@ -295,9 +291,9 @@ def parse_args():
help="Whether to run evaluation during training at each logging step.")
parser.add_argument("--evaluate_after_epoch", action="store_true",
help="Whether to run evaluation after each epoch.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
parser.add_argument("--per_gpu_train_batch_size", default=64, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int,
help="Batch size per GPU/CPU for evaluation.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
......@@ -334,7 +330,7 @@ if __name__ == "__main__":
if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}")
os.system(f"rm -rf ./run/finetune")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.device = device
......@@ -347,9 +343,10 @@ if __name__ == "__main__":
config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json')
tokenizer = CTTokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
# label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM"]
label2id = {j:i for i,j in enumerate(label_list)}
id2label = {i:j for i,j in enumerate(label_list)}
......@@ -380,7 +377,7 @@ if __name__ == "__main__":
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/last_checkpoint.pt"
save_dir = os.path.join(args.output_dir, "last_checkpoint.pt")
torch.save(model.state_dict(), save_dir)
logger.info("Saving last checkpoint to %s", save_dir)
......@@ -396,5 +393,5 @@ if __name__ == "__main__":
f1, _, overall_result = evaluate(args, model, test_dataset)
output_eval_file = os.path.join(args.output_dir, "test_results.txt")
with open(output_eval_file, "a") as writer:
writer.write('***** Predict in test dataset *****')
writer.write('***** Predict in test dataset ***** \n')
writer.write("{} \n".format(overall_result))
\ No newline at end of file
#!/bin/bash
export CUDA_VISIBLE_DEVICES=3
OUTPUT_DIR='./output/finetune_output'
MODEL_PATH='./output/pretrain_output/model.pt'
CUDA_VISIBLE_DEVICES='2' python fintune.py \
MODEL_PATH='./output/model.pt'
python finetune.py \
--do_train \
--do_eval \
--model_path $MODEL_PATH \
--output_dir $OUTPUT_DIR \
--evaluate_after_epoch \
--per_gpu_train_batch_size 42 \
--per_gpu_eval_batch_size 42 \
--per_gpu_train_batch_size 64 \
--per_gpu_eval_batch_size 64 \
--dropout_prob 0.1 \
--max_seq_length 256 \
--learning_rate 3e-5 \
--weight_decay 5e-5 \
--num_train_epochs 10 \
--logging_steps 0.5 \
--seed 42
\ No newline at end of file
......@@ -11,14 +11,12 @@ class CTEmbeddings(nn.Module):
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id)
# self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
self.register_buffer('position_embeddings', self._get_sinusoid_encoding_table(config.max_position_embeddings, config.d_model))
self.LayerNorm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def _get_sinusoid_encoding_table(self, n_position, d_hid):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
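# PE(pos, 2i) = sin(pos / 10000^(2i / d_model)), PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))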
def get_position_angle_vec(position):
# computes the angle term inside the sin/cos brackets for the given position
......@@ -36,8 +34,6 @@ class CTEmbeddings(nn.Module):
input_ids: torch.Tensor,
) -> torch.Tensor:
word_embeddings = self.word_embeddings(input_ids)
# position_embeddings = self.position_embeddings(position_ids)
# embeddings = word_embeddings + position_embeddings
embeddings = word_embeddings + self.position_embeddings[:, :word_embeddings.size(1)]
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
......@@ -56,9 +52,21 @@ class CTSelfAttention(nn.Module):
self.value = nn.Linear(config.d_model, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def generate_ct_mask(self, batch_size:Optional[int], seq_len:Optional[int]):
def my_triu(self, x: torch.Tensor, diagonal: int):
l = x.size(-1)
arange = torch.arange(l)
mask = arange.expand(l,l)
mask = mask-diagonal
arange = arange.unsqueeze(-1)
mask = torch.le(mask, arange)
return mask
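# Note: my_triu keeps positions j <= i + diagonal (a banded, lower-triangular-style mask), so
# masked_fill(ct_mask == 0, ...) below blocks attention to tokens more than max_future_length
# steps in the future. It is built from arange/expand/le only, presumably so the mask traces
# cleanly during ONNX export.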
def generate_ct_mask(self, batch_size: int, seq_len: int):
mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
mask = torch.triu(mask, diagonal=self.max_future_length)
# mask = torch.triu(mask, diagonal=self.max_future_length)
mask = self.my_triu(mask, diagonal=self.max_future_length)
mask = torch.stack([mask] * batch_size)
mask = mask.reshape(batch_size,1,seq_len,seq_len)
return mask
......@@ -69,7 +77,7 @@ class CTSelfAttention(nn.Module):
return x.permute(0, 2, 1, 3)
def expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
def expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
......@@ -96,16 +104,19 @@ class CTSelfAttention(nn.Module):
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention = attention.masked_fill(attention_mask == 0, float("-1e20"))
ct_mask = ct_mask.to(attention_mask.device)
attention = attention.masked_fill(ct_mask == 0, float("-1e20"))
attention = attention.masked_fill(attention_mask == 0, float("-1e20"))
attention = attention / math.sqrt(self.attention_head_size)
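# the -1e20 fills are applied before the 1/sqrt(d_k) scaling; the masked scores stay extremely negative, so softmax still drives them to ~0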
attention_probs = F.softmax(attention, dim=-1)
attention_probs = F.softmax(attention, dim=-1)
attention_probs = self.dropout(attention_probs)
# context_layer is the product of the attention probabilities and the value matrix; its shape is (batch_size, num_attention_heads, sequence_length, attention_head_size)
context_layer = torch.matmul(attention_probs, value_layer)
# after the permute and view below, context_layer is back to shape (batch_size, sequence_length, hidden_size)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
......@@ -132,6 +143,7 @@ class CTLayer(nn.Module):
# feed forward block
def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
# FFN(x) = max(0, x*W_1+ b_1)*W_2 + b_2
x = self.linear2(self.dropout(self.activation(self.linear1(x))))
return self.dropout2(x)
......@@ -150,8 +162,10 @@ class CTLayer(nn.Module):
x = hidden_states
# residual connection
x = self.norm1(x + self.dropout1(attention_scores))
# FeedForward
x = self.norm2(x + self._ff_block(x))
return x
......@@ -198,18 +212,11 @@ class CTTransformer(nn.Module):
attention_mask: torch.Tensor,
) -> torch.Tensor:
# batch_size, seq_len = input_ids.size()
# position_ids = self.generate_position_ids(batch_size, seq_len)
# device = input_ids.device
embedding_output = self.embeddings(
input_ids=input_ids,
# position_ids=position_ids,
)
embedding_output = self.embeddings(input_ids)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=attention_mask,
attention_mask,
)
last_hidden_state = encoder_outputs[0]
......@@ -233,7 +240,7 @@ class CTTransformerForPreTraining(nn.Module):
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
labels: torch.Tensor,
labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor]:
last_hidden_state = self.ct_tranformer(input_ids, attention_mask)
......
import os
import logging
import torch
from torch import nn
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from torch.optim import Adam
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from tokenizer import CTTokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random
import argparse
from glob import glob
from tqdm import tqdm, trange
writer = SummaryWriter('runs/pretrain')
logger = logging.getLogger(__name__)
......@@ -32,8 +35,7 @@ def collate_fn(batch):
class CTDataset(Dataset):
def __init__(self, config, tokenizer, max_length):
self.data = open(config.train_data,"r",encoding="utf-8").readlines()
self.texts, self.labels = self.read_data(self.data)
self.tokens, self.labels = self.read_data(config.data_dir)
self.label_list = label_list
self.label2id = label2id
......@@ -44,36 +46,50 @@ class CTDataset(Dataset):
self.max_length = max_length
self.pad_label_id = config.pad_label_id
def read_data(self, data):
def read_data(self, data_dir):
data_files = glob(os.path.join(data_dir,"*.txt"))
texts, labels = [], []
for sent in data:
sent = eval(sent)
for file in data_files:
lines = open(file,"r",encoding="utf-8").readlines()
logger.info(f"Loading {file} data!")
for line in tqdm(lines):
sent = eval(line)
text, label = [], []
for token_label in sent:
if token_label.count("|") > 1:
continue
token, tag = token_label.split("|")
text.append(token)
label.append(tag)
texts.append(text)
labels.append(label)
assert len(texts) == len(labels)
return texts, labels
def __len__(self):
return len(self.texts)
return len(self.tokens)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
assert len(text) == len(label)
token_ids, attention_mask = self.tokenizer(
text,
# padding='max_length',
# truncation=True,
max_length=self.max_length,
is_split_into_words=True,
# return_tensors='pt'
)
label_ids = [ label2id[i] for i in label]
tokens = self.tokens[idx]
labels = self.labels[idx]
tokens = tokens[:self.max_length - 2] if len(tokens) > self.max_length - 2 else tokens
labels = labels[:self.max_length - 2] if len(labels) > self.max_length - 2 else labels
label_ids = [ self.label2id[i] for i in labels]
tokens = [self.tokenizer.sentence_start_token] + tokens + [self.tokenizer.sentence_end_token]
label_ids = [self.pad_label_id] + label_ids + [self.pad_label_id]
attention_mask = [1] * len(tokens)
padding_length = self.max_length - len(tokens)
tokens.extend([self.tokenizer.padding_token] * padding_length)
attention_mask.extend([0] * padding_length)
token_ids = self.tokenizer.tokens2ids(tokens)
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
......@@ -103,9 +119,18 @@ def train(args, model, train_dataset):
# Initialize the optimizer and the learning rate scheduler
optimizer = Adam(model.parameters(), lr=args.lr)
# total_steps = len(train_dataloader) // args.epochs
# t_total = len(train_dataloader) // args.epochs
# scheduler = get_linear_schedule_with_warmup(
# optimizer, num_warmup_steps=0, num_training_steps=t_total
# )
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_batch_size)
# logger.info(" Total optimization steps = %d", t_total)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
training_loss = 0
global_step = 0
......@@ -124,6 +149,8 @@ def train(args, model, train_dataset):
outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0]
writer.add_scalar('Loss', loss, epoch)
epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch+1, round(loss.item(), 6)))
if args.n_gpu > 1:
......@@ -132,9 +159,8 @@ def train(args, model, train_dataset):
loss.backward()
training_loss += loss.item()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# scheduler.step()
optimizer.step()
# scheduler.step()
model.zero_grad()
global_step += 1
......@@ -180,8 +206,8 @@ def parse_args():
parser.add_argument("--epochs", default=3, type=int)
parser.add_argument("--lr", default=5e-5, type=float)
parser.add_argument("--per_gpu_batch_size", default=8, type=int)
parser.add_argument("--max_length", default=128, type=int)
parser.add_argument("--per_gpu_batch_size", default=600, type=int)
parser.add_argument("--max_length", default=256, type=int)
args = parser.parse_args()
return args
......@@ -194,6 +220,7 @@ if __name__ == "__main__":
if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}")
os.system("rm -rf ./run/pretrain")
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
......@@ -216,7 +243,7 @@ if __name__ == "__main__":
config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json')
tokenizer = CTTokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
......@@ -232,8 +259,13 @@ if __name__ == "__main__":
model = CTTransformerForPreTraining(config)
model.to(args.device)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f'Total number of parameters: {num_params}')
config.train_data = os.path.join(args.data_dir,'data.txt')
# config.data = open(args.data_dir,"r",encoding='utf-8').readlines()
config.data_dir = args.data_dir
pretrain_dataset = CTDataset(config, tokenizer, args.max_length)
train(args, model, pretrain_dataset)
writer.close()
CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--epochs 20 \
--per_gpu_batch_size 64 \
--lr 5e-5 \
--output_dir ./outputs \
export CUDA_VISIBLE_DEVICES="4,5,6,7"
python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--data_dir ./pretrain_data \
--epochs 10 \
--per_gpu_batch_size 500 \
--lr 1e-4 \
--output_dir ./output/pretrain_output \
--max_length 256 \
--seed 2024
\ No newline at end of file
import json
import os
import glob
import time
import torch
from tqdm import tqdm
from seqeval.metrics import accuracy_score, f1_score, classification_report
from config import CTConfig
from tokenizer import CTTokenizer
from model import CTTransformerForPreTraining
config = CTConfig("./config.json")
tokenizer = CTTokenizer("./vocab.json")
model_path = "./output/finetune_output/best_checkpoint.pt"
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
num_labels = len(label_list)
config.label_list = label_list
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
state_dict = torch.load(model_path, weights_only=True)
model.load_state_dict(state_dict)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
model.eval()
def predict(text):
start_time = time.perf_counter()
max_length=256
# text = "啊嗯对哦,有蓝色有蓝的那个哦,有logo是吧?Logo对花花式咖啡啡"
tokens = tokenizer.tokenize(text)
tokens = tokens[:max_length - 2] if len(tokens) > max_length - 2 else tokens
inputs = tokenizer(text, max_length=max_length, return_pt=True)
output = model(**inputs)
logits = output[1]
preds = logits.argmax(-1).view(-1).tolist()
preds = preds[1:len(tokens)+1]
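# drop the predictions for the <s> position and everything after the last real token (</s> and padding)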
# print(preds)
result = []
# print(len(tokens))
# print(len(preds))
assert len(tokens) == len(preds)
for token,pred in zip(tokens, preds):
if pred in [0,5,6]:
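# keep tokens labelled O (0), B-RP (5) or I-RP (6); drop interregnum (IM) and reparandum (RM) tokens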
result.append(token)
# print(''.join(result))
end_time = time.perf_counter()
# compute the elapsed time in seconds and convert it to milliseconds
elapsed_time_ms = (end_time - start_time) * 1000
result = f"文本:{text}, 结果:{''.join(result)}, 运行时间: {elapsed_time_ms:.2f} 毫秒"
print(result)
return result
if __name__ == '__main__':
data = open("1726035525.5445561.test.txt","r",encoding='utf-8').readlines()
fw = open("ct_result.txt","w",encoding='utf-8')
for d in data:
d = d.rstrip()
result = predict(d)
fw.write(result+"\n")
fw.close()
\ No newline at end of file
import json
import re
# import string
import torch
# zh_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# en_punct = string.punctuation
# punct_list = zh_punct + en_punct
class Tokenizer:
class CTTokenizer:
def __init__(self, vocab_path):
self.vocab = self.load_vocab(vocab_path)
self.vocab_set = set(self.vocab)  # use a set for faster membership checks
self.vocab_dict = {token: idx for idx, token in enumerate(self.vocab)}  # token-to-id mapping
self.sentence_start_token = '<s>'
self.sentence_end_token = '</s>'
self.padding_token = '<pad>'
self.unk_token = '<unk>'
def load_vocab(self, vocab_path):
with open(vocab_path, 'r', encoding='utf-8') as f:
......@@ -19,66 +17,70 @@ class Tokenizer:
return vocab
def ids2tokens(self, token_ids):
return [self.vocab[token_id] for token_id in token_ids]
def tokens2ids(self, tokens):
return [self.vocab.index(token) for token in tokens]
return [self.vocab_dict.get(token, self.vocab_dict[self.unk_token]) for token in tokens]
def tokenize(self, text):
text = text.lower()
# text = [char for char in text if char not in punct_list]
# text = ''.join(text)
# match single Chinese characters, English words, and other characters, including whitespace
pattern = r'[\u4e00-\u9fa5]|[a-zA-Z]+|\s|[^a-zA-Z\s]'
tokens = re.findall(pattern, text)
# handle English words, Chinese characters and whitespace
tokenize_result = []
for token in tokens:
if token in self.vocab:
if token.isalpha() and token.isascii():
tokenize_result.append(token)
elif token in (chr(i) for i in range(0x4E00, 0x9FA5 + 1)):
tokenize_result.append(token)
else:
tokenize_result.append(token)  # handle other symbols
# iterate over the characters of the text
current_token = []
for char in text:
if char.isalpha() and char.isascii():
current_token.append(char)  # English character
elif '\u4e00' <= char <= '\u9fa5':
if current_token:
tokenize_result.append(''.join(current_token))
current_token = []
tokenize_result.append(char)  # Chinese character
elif char.isdigit():  # handle digits
current_token.append(char)
else:
if not token.isspace():
tokenize_result.append('<unk>')
if current_token:
tokenize_result.append(''.join(current_token))
current_token = []
if not char.isspace():
tokenize_result.append(char)  # other symbols
return tokenize_result
if current_token:  # flush the last token
tokenize_result.append(''.join(current_token))
def __call__(self, text, max_length=None, is_split_into_words=None):
# map tokens that are not in the vocabulary to <unk>
return [token if token in self.vocab_set else self.unk_token for token in tokenize_result]
def __call__(self, text, max_length=None, is_split_into_words=None, return_pt = None):
if is_split_into_words:
tokens = text
else:
tokens = self.tokenize(text)
if len(tokens) > max_length - 2:
tokens = tokens[:max_length-2]  # truncate
tokens = tokens[:max_length - 2] if max_length and len(tokens) > max_length - 2 else tokens
tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token]
attention_mask = [1] * len(tokens)
# padding
padding_length = max_length - len(tokens)
attention_mask = [1] * len(tokens)  # build the attention mask
padding_length = (max_length - len(tokens)) if max_length else 0
tokens.extend([self.padding_token] * padding_length)
attention_mask.extend([0]*padding_length)
attention_mask.extend([0] * padding_length)
token_ids = self.tokens2ids(tokens)
assert len(token_ids) == len(attention_mask)
if return_pt:
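# convert to (1, seq_len) tensors so the inputs can be fed straight to the model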
token_ids, attention_mask = torch.tensor(token_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0)
inputs = {
"input_ids": token_ids,
"attention_mask": attention_mask
}
return inputs
return token_ids, attention_mask
# if __name__ == '__main__':
# tokenizer = Tokenizer('vocab.json')
# print(len(tokenizer.vocab))
# text = "Hello, 你好!This is a test."
# token_ids, attention_mask = tokenizer.encode(text, 32)
# text = "Hello, 你好!This 2025 40 is a test."
# token_ids, attention_mask = tokenizer(text, 32)
# print(token_ids)