Commit 29a4e564 by tongtao.ling

Initial commit

__pycache__/
dataset/
output/
pretrain_data/
\ No newline at end of file
{
  "vocab_size": 272833,
  "d_model": 256,
  "num_attention_heads": 8,
  "max_position_embeddings": 128,
  "hidden_dropout_prob": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "layer_norm_eps": 1e-12,
  "pad_token_id": 0,
  "pad_label_id": -100,
  "intermediate_size": 1024,
  "num_hidden_layers": 4,
  "L_N": 9
}
\ No newline at end of file
import json
class CTConfig:
def __init__(self, json_file):
with open(json_file, 'r') as file:
self._config_data = json.load(file)
# Expose the loaded dict entries as attributes on this object
for key, value in self._config_data.items():
setattr(self, key, value)
def __getattr__(self, item):
if item in self._config_data:
return self._config_data[item]
raise AttributeError(f"'CTConfig' object has no attribute '{item}'")
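# Usage sketch (assuming the config.json above sits next to this file):
# config = CTConfig('config.json')
# config.d_model            # 256
# config.num_hidden_layers  # 4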
\ No newline at end of file
import random
import json
import jieba
import re
import time
import os
import pandas as pd
import string
import threading
from tqdm import tqdm
from tokenizer import Tokenizer
chinese_punctuation = ",。!?;:()【】《》“”‘’、"
punctuation_list = string.punctuation + chinese_punctuation
interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯']
# interregnum_list = [token for token, pos in dt.word_tag_tab.items() if pos == "y"]
# interregnum_list.extend(["啊","哦","哎","恩","嗯"])
# interregnum_list.remove("吧")
# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = Tokenizer("vocab.json")
def process_text(sent):
sent = sent.replace("\n","").replace("\\n","").replace("|","")
sent = sent.replace("\u3000","").replace("\u2006","").replace("\u200e","").replace("\u200b","").replace(u"\xa0","").replace("\uf04a","").replace(u"\xad","").replace(u"\ufeff","")
sent = remove_special_chars(sent)
return sent
def remove_special_chars(input_string):
# Keep Chinese/English word characters and whitespace; drop everything else
return re.sub(r'[^\w\s\u4e00-\u9fa5]+', '', input_string)
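# Example: remove_special_chars("你好, world!!") returns "你好 world".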
def create_repetitions(sentence):
k_rm = random.randint(1,3)
k_im = random.randint(1,2)
k = k_rm + k_im
if not sentence:
return None
# raise TypeError(
# """A 'NoneType' object received while a 'str' object is required."""
# )
else:
words = jieba.lcut(sentence)
words = list(filter(None, words))
# words = [word for word in words if word not in [' ','']]
# assert '' not in words and ' ' not in words
# words = tokenizer.tokenize(sentence)
words_without_rm_im = [ word for word in words if word not in punctuation_list and word not in interregnum_list ]
if len(words_without_rm_im) < k:
return None
indices = []
all_indexes = list(range(len(words)))
random.shuffle(all_indexes)
count = 0
idx = 0
while count < k:
if words[all_indexes[idx]] not in punctuation_list and words[all_indexes[idx]] not in interregnum_list:
indices.append(all_indexes[idx])
count+=1
idx+=1
assert len(indices) == k
# selected_words = [words[i] for i in indices]
# print(f"selected_words:{selected_words}")
rm_indices = indices[:k_rm]
im_indices = indices[k_rm:]
tokens = []
labels = []
for idx,word in enumerate(words):
if word == ' ':
continue
if idx in rm_indices:
if len(word) == 1:
options = [1, 2, 3]
weights = [0.4, 0.35, 0.25]
# elif len(word) == 2:
# options = [1, 2, 3]
# weights = [0.4, 0.35, 0.25]
else:
options = [1, 2]
weights = [0.5, 0.5]
choices = random.choices(options, weights=weights, k=1)[0]
if word.isalpha() and word.isascii():
new_word = [ word ] * choices
new_word = ' '.join(new_word)
else:
new_word = word * choices
# start = time.time()
new_word = tokenizer.tokenize(new_word)
# end = time.time()
# print("tokenize时间:%.2f秒"%(end-start))
tokens.extend(new_word)
labels.append("B-RM")
labels.extend(["I-RM"]*(len(new_word)-1))
word = tokenizer.tokenize(word)
tokens.extend(word)
labels.append("B-RP")
labels.extend(["I-RP"]*(len(word)-1))
# print(f"words:{words}")
# print(f"word:{word}")
# print(tokens)
# print(labels)
assert len(tokens) == len(labels)
elif idx in im_indices:
word = tokenizer.tokenize(word)
tokens.extend(word)
labels.extend(["O"]*len(word))
im_word = random.choice(interregnum_list)
im_word = tokenizer.tokenize(im_word)
tokens.extend(im_word)
labels.append("B-IM")
labels.extend(["I-IM"]*(len(im_word)-1))
assert len(tokens) == len(labels)
else:
word = tokenizer.tokenize(word)
tokens.extend(word)
labels.extend(["O"]*len(word))
assert len(tokens) == len(labels)
assert len(tokens) == len(labels)
token_label = [ token + "|" + label for token,label in zip(tokens,labels)]
# print(token_label)
return token_label
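# Illustrative output (the real result is random): for the input "今天天气很好",
# create_repetitions may return something like
# ['今|B-RM', '天|I-RM', '今|B-RP', '天|I-RP', '天|O', '气|O', '很|O', '嗯|B-IM', '好|O'],
# i.e. an inserted repetition labelled RM, the original span labelled RP,
# and an inserted interregnum ("嗯") labelled IM.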
def create_csc(data_dir, output_dir):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
for split in ["train","dev","test"]:
# for split in ["test"]:
data = json.load(open(os.path.join(data_dir, f"{split}.json"),"r",encoding="utf-8"))
data = data[:100]
token_label_data = []
for item in data:
# sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的,"
sentence = item["correct_text"]
# sentence = process_text(sentence)
if item["wrong_ids"] == []:
token_label = [ token+"|O" for token in sentence]
else:
# start = time.time()
token_label = create_repetitions(sentence)
# end = time.time()
# print("create_repetitions时间:%.2f秒"%(end-start))
if token_label is None:
continue
token_label_data.append(token_label)
random.shuffle(token_label_data)
with open(os.path.join(output_dir, f"{split}.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
def create_lang_8(data_dir, output_dir):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
data = open(os.path.join(data_dir,"NLPCC2018_GEC_TrainingData/data.train"),"r",encoding="utf-8").readlines()
print(f"data size: {len(data)}")
token_label_data = []
for item in tqdm(data):
sentence = item.split("\t")[-1]
sentence = process_text(sentence)
if len(item.split("\t")) == 3:
token_label = [ token+"|O" for token in sentence]
else:
token_label = create_repetitions(sentence)
if token_label is None:
continue
token_label_data.append(token_label)
# random.shuffle(token_label_data)
# split_index = int(len(token_label_data)*0.95)
# train_data = token_label_data[:split_index]
# dev_data = token_label_data[split_index:]
with open(os.path.join(output_dir, "data.txt"),"w",encoding="utf-8") as f:
for i in token_label_data:
f.write(str(i)+"\n")
# with open(os.path.join(output_dir, "dev.txt"),"w",encoding="utf-8") as f:
# for i in dev_data:
# f.write(str(i)+"\n")
# test_dataset = open(os.path.join(data_dir,"TestData_Task2/source.txt"),"r",encoding="utf-8").readlines()
# test_data = []
# for sentence in test_dataset:
# sentence = process_text(sentence)
# token_label = create_repetitions(sentence)
# if token_label == None:
# continue
# test_data.append(token_label)
# with open(os.path.join(output_dir, "test.txt"),"w",encoding="utf-8") as f:
# for i in test_data:
# f.write(str(i)+"\n")
# class DatasetThread(threading.Thread): #继承父类threading.Thread
# def __init__(self, threadID, name, counter):
# threading.Thread.__init__(self)
# self.threadID = threadID
# self.name = name
# self.counter = counter
# def run(self):  # Put the work in run(); the thread executes run() right after it starts
# print("Starting:",self.name)
# create_csc("./csc", "./data")
# print "Exiting " + self.name
if __name__ == "__main__":
create_lang_8("./lang-8", "./pretrain_data")
# create_csc("./csc", "./data")
# Create two threads
# try:
# thread.start_new_thread( print_time, ("Thread-1", 2, ) )
# thread.start_new_thread( print_time, ("Thread-2", 4, ) )
# except:
# print "Error: unable to start thread"
# while 1:
# pass
import os
import logging
import torch
from torch import nn
from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
import numpy as np
import random
import argparse
from tqdm import tqdm, trange
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
logger = logging.getLogger(__name__)
def collate_fn(batch):
new_batch = { key: [] for key in batch[0].keys()}
for b in batch:
for key in new_batch:
new_batch[key].append(torch.tensor(b[key]))
for b in new_batch:
new_batch[b] = torch.stack(new_batch[b])
return new_batch
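# collate_fn turns a list of per-example dicts into one dict of stacked tensors,
# e.g. 8 examples padded to max_length=128 become a token_ids tensor of shape (8, 128).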
# Dataset definition
class CTDataset(Dataset):
def __init__(self, config, data_dir, tokenizer, max_length):
self.data = open(data_dir,"r",encoding="utf-8").readlines()
self.texts, self.labels = self.read_data(self.data)
# self.label_list = label_list
# self.label2id = label2id
self.id2label = id2label
# self.num_labels = num_labels
self.tokenizer = tokenizer
self.max_length = max_length
self.pad_label_id = config.pad_label_id
def read_data(self, data):
texts, labels = [], []
for sent in data:
sent = eval(sent)
text, label = [], []
for token_label in sent:
token, tag = token_label.split("|")
text.append(token)
label.append(tag)
texts.append(text)
labels.append(label)
return texts, labels
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
assert len(text) == len(label)
token_ids, attention_mask = self.tokenizer(
text,
# padding='max_length',
# truncation=True,
max_length=self.max_length,
is_split_into_words=True,
# return_tensors='pt'
)
label_ids = [ label2id[i] for i in label]
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
inputs = {
"token_ids": token_ids,
"attention_mask": attention_mask,
"label_ids": label_ids
}
return inputs
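# Each line of the data file is a Python-literal list such as
# "['今|O', '天|O', '天|B-RM', ...]"; read_data() parses it with eval() and splits
# every "token|label" pair into parallel token and label sequences.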
# def compute_metrics(predictions, labels):
# true_predictions = [
# [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# true_labels = [
# [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
# for prediction, label in zip(predictions, labels)
# ]
# precision=precision_score(true_labels, true_predictions, average='micro')
# recall=recall_score(true_labels, true_predictions, average='micro')
# f1=f1_score(true_labels, true_predictions, average='micro')
# return precision, recall, f1
def train(args, model, train_dataset, dev_dataset):
args.train_batch_size = args.per_gpu_train_batch_size
# Sampler
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
args.logging_steps = eval(args.logging_steps)
if isinstance(args.logging_steps, float):
args.logging_steps = int(args.logging_steps * len(train_dataloader)) // args.gradient_accumulation_steps
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
best_f1 = 0.0
training_loss = 0
model.zero_grad()
epoch_iterator = trange(0, int(args.num_train_epochs), desc="Epoch")
set_seed(args)
for epoch in epoch_iterator:
pbar = tqdm(train_dataloader, desc="Training")
for step, batch in enumerate(pbar):
model.train()
token_ids = batch['token_ids'].to(args.device)
attention_mask = batch['attention_mask'].to(args.device)
label_ids = batch['label_ids'].to(args.device)
outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0]
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
loss.backward()
training_loss += loss.item()
epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch+1, round(loss.item(), 6)))
if (step + 1) % args.gradient_accumulation_steps == 0:
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
model.zero_grad()
global_step += 1
if args.logging_steps > 0 and global_step % args.logging_steps == 0:
if args.evaluate_during_training:
f1, _, _ = evaluate(args, model, dev_dataset)
if best_f1 < f1:
best_f1 = f1
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt"
torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir)
if args.evaluate_after_epoch:
f1, _, _ = evaluate(args, model, dev_dataset)
if best_f1 < f1:
best_f1 = f1
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/best_checkpoint.pt"
torch.save(model.state_dict(), save_dir)
logger.info("Saving best checkpoint to %s", save_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
return global_step, training_loss / global_step
# Evaluation function
def evaluate(args, model, eval_dataset):
args.eval_batch_size = args.per_gpu_eval_batch_size
# Sampler
test_sampler = SequentialSampler(eval_dataset)
test_dataloader = DataLoader(eval_dataset, collate_fn=collate_fn, sampler=test_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
pbar = tqdm(test_dataloader, desc="Testing")
model.eval()
eval_loss = 0.0
eval_steps = 0
preds = None
trues = None
with torch.no_grad():
for batch in pbar:
# Move to device
token_ids = batch['token_ids'].to(args.device)
attention_mask = batch['attention_mask'].to(args.device)
label_ids = batch['label_ids'].to(args.device)
outputs = model(token_ids, attention_mask, label_ids)
logits = outputs[1]
tmp_eval_loss = outputs[0]
eval_loss += tmp_eval_loss.item()
eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
trues = label_ids.detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
trues = np.append(trues, label_ids.detach().cpu().numpy(), axis=0)
preds = np.argmax(preds, axis=2)
true_predictions = [
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(preds, trues)
]
true_labels = [
[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(preds, trues)
]
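# Positions whose gold label id is -100 (pad_label_id) are dropped above,
# so padding never contributes to the seqeval scores.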
precision=precision_score(true_labels, true_predictions)
recall=recall_score(true_labels, true_predictions)
f1=f1_score(true_labels, true_predictions)
overall_result = classification_report(true_labels, true_predictions)
print(overall_result)
logger.info(f"p:{precision}, r:{recall}, f1:{f1}")
return f1, eval_loss / eval_steps, overall_result
# Set the random seed
def set_seed(args):
seed = args.seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", default="./data", type=str)
parser.add_argument("--model_path", default="./outputs/model.pt", type=str)
parser.add_argument("--output_dir", default='./ft_outputs/', type=str)
parser.add_argument("--max_seq_length", default=128, type=int)
parser.add_argument("--do_train", action="store_true",
help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true",
help="Whether to run eval on the dev set.")
parser.add_argument("--evaluate_during_training", action="store_true",
help="Whether to run evaluation during training at each logging step.")
parser.add_argument("--evaluate_after_epoch", action="store_true",
help="Whether to run evaluation after each epoch.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for evaluation.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--learning_rate", default=3e-5, type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--dropout_prob", default=0.1, type=float,
help="dropout_prob.")
parser.add_argument("--weight_decay", default=5e-5, type=float,
help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
help="Max gradient norm.")
parser.add_argument("--num_train_epochs", default=5.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1, type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--warmup_steps", default=0, type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--logging_steps", type=str, default='0.5',
help="Log every X updates steps.")
parser.add_argument("--seed", type=int, default=42,
help="random seed for initialization")
args = parser.parse_args()
return args
# Main entry point
if __name__ == "__main__":
args = parse_args()
# Remove any previous output directory
if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.device = device
# Setup logging
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO)
set_seed(args)
config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
label2id = {j:i for i,j in enumerate(label_list)}
id2label = {i:j for i,j in enumerate(label_list)}
num_labels = len(label_list)
config.label_list = label_list
config.label2id = label2id
config.id2label = id2label
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
model.from_pretrained(args.model_path)
model.to(args.device)
train_data_path = os.path.join(args.data_dir,'train.txt')
dev_data_path = os.path.join(args.data_dir,'dev.txt')
test_data_path = os.path.join(args.data_dir,'test.txt')
train_dataset = CTDataset(config, train_data_path, tokenizer, args.max_seq_length)
dev_dataset = CTDataset(config, dev_data_path, tokenizer, args.max_seq_length)
test_dataset = CTDataset(config, test_data_path, tokenizer, args.max_seq_length)
if args.do_train:
global_step, tr_loss = train(args, model, train_dataset, dev_dataset)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Create output directory if needed
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
save_dir = f"{args.output_dir}/last_checkpoint.pt"
torch.save(model.state_dict(), save_dir)
logger.info("Saving last checkpoint to %s", save_dir)
# Evaluation
if args.do_eval:
checkpoint = os.path.join(args.output_dir, 'best_checkpoint.pt')
state_dict = torch.load(checkpoint, weights_only=True)
model.load_state_dict(state_dict)
model.to(args.device)
f1, _, overall_result = evaluate(args, model, test_dataset)
output_eval_file = os.path.join(args.output_dir, "test_results.txt")
with open(output_eval_file, "a") as writer:
writer.write('***** Predict in test dataset *****\n')
writer.write("{} \n".format(overall_result))
\ No newline at end of file
#!/bin/bash
OUTPUT_DIR='./output/finetune_output'
MODEL_PATH='./output/pretrain_output/model.pt'
CUDA_VISIBLE_DEVICES='2' python fintune.py \
--do_train \
--do_eval \
--model_path $MODEL_PATH \
--output_dir $OUTPUT_DIR \
--evaluate_after_epoch \
--per_gpu_train_batch_size 42 \
--per_gpu_eval_batch_size 42 \
--dropout_prob 0.1 \
--max_seq_length 256 \
--learning_rate 3e-5 \
--weight_decay 5e-5 \
--num_train_epochs 10 \
--seed 42
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
from typing import Optional, Tuple
class CTEmbeddings(nn.Module):
"""Construct the embeddings from word and position embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id)
# self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
self.register_buffer('position_embeddings', self._get_sinusoid_encoding_table(config.max_position_embeddings, config.d_model))
self.LayerNorm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def _get_sinusoid_encoding_table(self, n_position, d_hid):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
# compute the angle for every hidden dimension at this position
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
# columns [:, 0::2] are the even dimensions (2i)
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.FloatTensor(sinusoid_table).unsqueeze(0)
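# The table follows the standard sinusoidal encoding:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))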
def forward(
self,
input_ids: torch.Tensor,
) -> torch.Tensor:
word_embeddings = self.word_embeddings(input_ids)
# position_embeddings = self.position_embeddings(position_ids)
# embeddings = word_embeddings + position_embeddings
embeddings = word_embeddings + self.position_embeddings[:, :word_embeddings.size(1)]
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class CTSelfAttention(nn.Module):
def __init__(self, config, max_future_length):
super().__init__()
self.max_future_length = max_future_length
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.d_model / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.d_model, self.all_head_size)
self.key = nn.Linear(config.d_model, self.all_head_size)
self.value = nn.Linear(config.d_model, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def generate_ct_mask(self, batch_size: Optional[int], seq_len: Optional[int]):
# Position i may attend to positions j <= i + max_future_length; anything further
# in the future is masked out via masked_fill(ct_mask == 0, ...) in forward().
mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
mask = torch.tril(mask, diagonal=self.max_future_length)
mask = torch.stack([mask] * batch_size)
mask = mask.reshape(batch_size, 1, seq_len, seq_len)
return mask
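# For example, with seq_len = 5 and max_future_length = 1, each row i of the
# (unbatched) mask allows positions j <= i + 1:
# [[1, 1, 0, 0, 0],
#  [1, 1, 1, 0, 0],
#  [1, 1, 1, 1, 0],
#  [1, 1, 1, 1, 1],
#  [1, 1, 1, 1, 1]]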
def transpose_for_scores(self, x: torch.Tensor):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
def expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
# inverted_mask = 1.0 - expanded_mask
# return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
return expanded_mask
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
) -> Tuple[torch.Tensor]:
batch_size, seq_length = hidden_states.shape[:2]
ct_mask = self.generate_ct_mask(batch_size, seq_length)
attention_mask = self.expand_mask(attention_mask, attention_mask.dtype)
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
attention = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention = attention.masked_fill(attention_mask == 0, float("-1e20"))
ct_mask = ct_mask.to(attention_mask.device)
attention = attention.masked_fill(ct_mask == 0, float("-1e20"))
attention = attention / math.sqrt(self.attention_head_size)
attention_probs = F.softmax(attention, dim=-1)
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs)
return outputs
class CTLayer(nn.Module):
def __init__(self, config, max_future_length):
super().__init__()
self.attention = CTSelfAttention(config, max_future_length)
self.linear1 = nn.Linear(config.d_model, config.intermediate_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.linear2 = nn.Linear(config.intermediate_size, config.d_model)
self.norm1 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.norm2 = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
self.dropout1 = nn.Dropout(config.hidden_dropout_prob)
self.dropout2 = nn.Dropout(config.hidden_dropout_prob)
self.activation = torch.nn.GELU()
# feed forward block
def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
x = self.linear2(self.dropout(self.activation(self.linear1(x))))
return self.dropout2(x)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
) -> torch.Tensor:
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
)
attention_scores = self_attention_outputs[0]
x = hidden_states
x = self.norm1(x + self.dropout1(attention_scores))
x = self.norm2(x + self._ff_block(x))
return x
class CTEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.layer_1 = [CTLayer(config,0) for _ in range(config.num_hidden_layers-1)]
self.layer_2 = [CTLayer(config,config.L_N)]
self.layers = nn.ModuleList(self.layer_1 + self.layer_2)
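# The first num_hidden_layers-1 layers use no future context (max_future_length=0);
# only the final layer may look ahead, by up to config.L_N tokens.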
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor
) -> Tuple[torch.Tensor]:
all_hidden_states = ()
for layer_module in self.layers:
layer_outputs = layer_module(
hidden_states,
attention_mask,
)
hidden_states = layer_outputs
all_hidden_states = all_hidden_states + (hidden_states,)
last_hidden_state = hidden_states
output = (last_hidden_state, all_hidden_states)
return output
class CTTransformer(nn.Module):
def __init__(self, config):
super().__init__()
self.embeddings = CTEmbeddings(config)
self.encoder = CTEncoder(config)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
) -> torch.Tensor:
# batch_size, seq_len = input_ids.size()
# position_ids = self.generate_position_ids(batch_size, seq_len)
# device = input_ids.device
embedding_output = self.embeddings(
input_ids=input_ids,
# position_ids=position_ids,
)
encoder_outputs = self.encoder(
embedding_output,
attention_mask=attention_mask,
)
last_hidden_state = encoder_outputs[0]
return last_hidden_state
class CTTransformerForPreTraining(nn.Module):
def __init__(self, config):
super().__init__()
self.num_labels = config.num_labels
self.ct_tranformer = CTTransformer(config)
self.tagging_layer = nn.Linear(config.d_model, self.num_labels)  # token-level classification head over num_labels tags
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def from_pretrained(self, model_path):
state_dict = torch.load(model_path, weights_only=True)
self.ct_tranformer.load_state_dict(state_dict)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
labels: torch.Tensor,
) -> Tuple[torch.Tensor]:
last_hidden_state = self.ct_tranformer(input_ids, attention_mask)
last_hidden_state = self.dropout(last_hidden_state)
logits = self.tagging_layer(last_hidden_state)
loss = None
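# pad_label_id is -100, which matches CrossEntropyLoss's default ignore_index,
# so padded positions are excluded from the loss.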
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss, logits)
return outputs
# def generate_inputs(batch_size, max_len):
# input_ids = []
# for _ in range(batch_size):
# seq_length = random.randint(5,10)
# token_ids = np.random.randint(1,config.vocab_size,(seq_length,))
# token_ids = np.append(token_ids,np.zeros(max_len-len(token_ids)),axis=0)
# input_ids.append(torch.tensor(token_ids).long())
# input_ids = torch.stack(input_ids)
# attention_mask = (input_ids != 0).long()
# return input_ids, attention_mask
# if __name__ == '__main__':
# config = CTConfig('config.json')
# # batch_size = 4
# max_len = 32
# from tokenizer import Tokenizer
# tokenizer = Tokenizer('vocab.json')
# text = "Hello, 你好!This is a test."
# input_ids, attention_mask = tokenizer(text, max_length=max_len)
# input_ids, attention_mask = torch.tensor(input_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0)
# # input_ids, attention_mask = generate_inputs(batch_size, max_len)
# model = CTTransformer(config)
# outputs = model(input_ids, attention_mask)
(Source diff not shown: file too large to display.)
{
"framework": "pytorch",
"task" : "punctuation",
"model" : {
"type" : "generic-punc",
"punc_model_name" : "punc.pb",
"punc_model_config" : {
"type": "pytorch",
"code_base": "funasr",
"mode": "punc",
"lang": "zh-cn",
"batch_size": 1,
"punc_config": "punc.yaml",
"model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
}
},
"pipeline": {
"type":"punc-inference"
}
}
(Source diff not shown: file too large to display.)
import os
import logging
import torch
from torch import nn
from torch.utils.data.distributed import DistributedSampler
from torch.optim import Adam
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
import numpy as np
import random
import argparse
from tqdm import tqdm, trange
logger = logging.getLogger(__name__)
def collate_fn(batch):
new_batch = { key: [] for key in batch[0].keys()}
for b in batch:
for key in new_batch:
new_batch[key].append(torch.tensor(b[key]))
for b in new_batch:
new_batch[b] = torch.stack(new_batch[b])
return new_batch
# Dataset definition
class CTDataset(Dataset):
def __init__(self, config, tokenizer, max_length):
self.data = open(config.train_data,"r",encoding="utf-8").readlines()
self.texts, self.labels = self.read_data(self.data)
self.label_list = label_list
self.label2id = label2id
self.id2label = id2label
self.num_labels = num_labels
self.tokenizer = tokenizer
self.max_length = max_length
self.pad_label_id = config.pad_label_id
def read_data(self, data):
texts, labels = [], []
for sent in data:
sent = eval(sent)
text, label = [], []
for token_label in sent:
token, tag = token_label.split("|")
text.append(token)
label.append(tag)
texts.append(text)
labels.append(label)
return texts, labels
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
assert len(text) == len(label)
token_ids, attention_mask = self.tokenizer(
text,
# padding='max_length',
# truncation=True,
max_length=self.max_length,
is_split_into_words=True,
# return_tensors='pt'
)
label_ids = [ label2id[i] for i in label]
label_ids.extend([self.pad_label_id]*(self.max_length-len(label_ids)))
assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
inputs = {
"token_ids": token_ids,
"attention_mask": attention_mask,
"label_ids": label_ids
}
return inputs
def train(args, model, train_dataset):
train_batch_size = args.per_gpu_batch_size * max(1, args.n_gpu)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
# Sampler
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, sampler=train_sampler, batch_size=train_batch_size)
# Initialize the optimizer and learning-rate scheduler
optimizer = Adam(model.parameters(), lr=args.lr)
# total_steps = len(train_dataloader) // args.epochs
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
training_loss = 0
global_step = 0
model.zero_grad()
epoch_iterator = trange(0, int(args.epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args)
for epoch in epoch_iterator:
pbar = tqdm(train_dataloader, desc="Training", disable=args.local_rank not in [-1, 0])
for batch in pbar:
model.train()
token_ids = batch['token_ids'].to(args.device)
attention_mask = batch['attention_mask'].to(args.device)
label_ids = batch['label_ids'].to(args.device)
outputs = model(token_ids, attention_mask, label_ids)
loss = outputs[0]
epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch+1, round(loss.item(), 6)))
if args.n_gpu > 1:
loss = loss.mean()
loss.backward()
training_loss += loss.item()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# scheduler.step()
optimizer.step()
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0]:
model_to_save = model.module if hasattr(model, "module") else model
model_to_save = model_to_save.ct_tranformer
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
save_path = os.path.join(args.output_dir, f"epoch:{epoch+1}_loss:{training_loss / global_step}.pt")
torch.save(model_to_save.state_dict(), save_path)
if args.local_rank in [-1, 0]:
model_to_save = model.module if hasattr(model, "module") else model
model_to_save = model_to_save.ct_tranformer
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
save_path = os.path.join(args.output_dir, "model.pt")
torch.save(model_to_save.state_dict(), save_path)
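# Only the CTTransformer backbone (ct_tranformer) is saved; the tagging head is
# re-initialized at fine-tuning time, where from_pretrained() loads this state dict
# back into the same submodule.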
logger.info(f"Training loss: {training_loss / global_step}")
# Set the random seed
def set_seed(args):
seed = args.seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(seed)
def parse_args():
parser = argparse.ArgumentParser()
## DDP: local_rank is supplied externally; the distributed launcher passes this argument automatically when DDP is used.
parser.add_argument("--local-rank", default=-1, type=int)
parser.add_argument("--no_cuda", action="store_true", help="Whether to cuda.")
parser.add_argument("--seed", default=2024, type=int)
parser.add_argument("--data_dir", default="./pretrain_data", type=str)
parser.add_argument("--output_dir", default="./outputs/", type=str)
parser.add_argument("--epochs", default=3, type=int)
parser.add_argument("--lr", default=5e-5, type=float)
parser.add_argument("--per_gpu_batch_size", default=8, type=int)
parser.add_argument("--max_length", default=128, type=int)
args = parser.parse_args()
return args
# Main entry point
if __name__ == "__main__":
args = parse_args()
# Remove any previous output directory
if os.path.exists(args.output_dir):
os.system(f"rm -rf {args.output_dir}")
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend="nccl")
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
set_seed(args)
config = CTConfig('config.json')
# Load the tokenizer and model
tokenizer = Tokenizer('vocab.json')
label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
label2id = {j:i for i,j in enumerate(label_list)}
id2label = {i:j for i,j in enumerate(label_list)}
num_labels = len(label_list)
config.label_list = label_list
config.label2id = label2id
config.id2label = id2label
config.num_labels = num_labels
model = CTTransformerForPreTraining(config)
model.to(args.device)
config.train_data = os.path.join(args.data_dir,'data.txt')
pretrain_dataset = CTDataset(config, tokenizer, args.max_length)
train(args, model, pretrain_dataset)
CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--epochs 20 \
--per_gpu_batch_size 64 \
--lr 5e-5 \
--output_dir ./outputs \
--seed 2024
\ No newline at end of file
import json
import re
# import string
# zh_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# en_punct = string.punctuation
# punct_list = zh_punct + en_punct
class Tokenizer:
def __init__(self, vocab_path):
self.vocab = self.load_vocab(vocab_path)
self.sentence_start_token = '<s>'
self.sentence_end_token = '</s>'
self.padding_token = '<pad>'
def load_vocab(self, vocab_path):
with open(vocab_path, 'r', encoding='utf-8') as f:
vocab = json.load(f)
return vocab
def ids2tokens(self, token_ids):
return [self.vocab[token_id] for token_id in token_ids]
def tokens2ids(self, tokens):
return [self.vocab.index(token) for token in tokens]
def tokenize(self, text):
text = text.lower()
# text = [char for char in text if char not in punct_list]
# text = ''.join(text)
# Match single Chinese characters, English words, and any other character, including whitespace
pattern = r'[\u4e00-\u9fa5]|[a-zA-Z]+|\s|[^a-zA-Z\s]'
tokens = re.findall(pattern, text)
# Handle English words, Chinese characters, and whitespace
tokenize_result = []
for token in tokens:
if token in self.vocab:
if token.isalpha() and token.isascii():
tokenize_result.append(token)
elif token in (chr(i) for i in range(0x4E00, 0x9FA5 + 1)):
tokenize_result.append(token)
else:
tokenize_result.append(token)  # other symbols
else:
if not token.isspace():
tokenize_result.append('<unk>')
return tokenize_result
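# Note: out-of-vocabulary tokens become '<unk>', except whitespace, which is
# silently dropped when it is not in the vocabulary.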
def __call__(self, text, max_length=None, is_split_into_words=None):
if is_split_into_words:
tokens = text
else:
tokens = self.tokenize(text)
if len(tokens) > max_length - 2:
tokens = tokens[:max_length-2]  # truncate, leaving room for <s> and </s>
tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token]
attention_mask = [1] * len(tokens)
# Pad to max_length
padding_length = max_length - len(tokens)
tokens.extend([self.padding_token] * padding_length)
attention_mask.extend([0]*padding_length)
token_ids = self.tokens2ids(tokens)
assert len(token_ids) == len(attention_mask)
return token_ids, attention_mask
# if __name__ == '__main__':
# tokenizer = Tokenizer('vocab.json')
# print(len(tokenizer.vocab))
# text = "Hello, 你好!This is a test."
# token_ids, attention_mask = tokenizer(text, max_length=32)
(Source diff not shown: file too large to display.)