Commit 29a4e564 by tongtao.ling

Initial commit

__pycache__/
dataset/
output/
pretrain_data/
{
    "vocab_size": 272833,
    "d_model": 256,
    "num_attention_heads": 8,
    "max_position_embeddings": 128,
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "layer_norm_eps": 1e-12,
    "pad_token_id": 0,
    "pad_label_id": -100,
    "intermediate_size": 1024,
    "num_hidden_layers": 4,
    "L_N": 9
}
import json


class CTConfig:
    def __init__(self, json_file):
        with open(json_file, 'r') as file:
            self._config_data = json.load(file)
        # Expose every entry of the JSON dict as an object attribute
        for key, value in self._config_data.items():
            setattr(self, key, value)

    def __getattr__(self, item):
        if item in self._config_data:
            return self._config_data[item]
        raise AttributeError(f"'Config' object has no attribute '{item}'")
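

if __name__ == "__main__":
    # A minimal usage sketch (assuming a config.json like the one in this repo sits next
    # to this file): every key in the JSON becomes an attribute, so the fields printed
    # below are just examples.
    config = CTConfig("config.json")
    print(config.vocab_size, config.d_model, config.num_hidden_layers)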
import random
import json
import jieba
import re
import time
import os
import pandas as pd
import string
import threading
from tqdm import tqdm
from tokenizer import Tokenizer

chinese_punctuation = ",。!?;:()【】《》“”‘’、"
punctuation_list = string.punctuation + chinese_punctuation
interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯']
# interregnum_list = [token for token, pos in dt.word_tag_tab.items() if pos == "y"]
# interregnum_list.extend(["啊","哦","哎","恩","嗯"])
# interregnum_list.remove("吧")

# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = Tokenizer("vocab.json")


def process_text(sent):
    sent = sent.replace("\n", "").replace("\\n", "").replace("|", "")
    sent = sent.replace("\u3000", "").replace("\u2006", "").replace("\u200e", "").replace("\u200b", "").replace(u"\xa0", "").replace("\uf04a", "").replace(u"\xad", "").replace(u"\ufeff", "")
    sent = remove_special_chars(sent)
    return sent


def remove_special_chars(input_string):
    # Keep word characters, whitespace and CJK characters; drop everything else
    return re.sub(r'[^\w\s\u4e00-\u9fa5]+', '', input_string)


def create_repetitions(sentence):
    k_rm = random.randint(1, 3)
    k_im = random.randint(1, 2)
    k = k_rm + k_im
    if not sentence:
        return None
        # raise TypeError(
        #     """A 'NoneType' object received while a 'str' object is required."""
        # )
    else:
        words = jieba.lcut(sentence)
        words = list(filter(None, words))
        # words = [word for word in words if word not in [' ', '']]
        # assert '' not in words and ' ' not in words
        # words = tokenizer.tokenize(sentence)
        words_without_rm_im = [word for word in words if word not in punctuation_list and word not in interregnum_list]
        if len(words_without_rm_im) < k:
            return None
        indices = []
        all_indexes = list(range(len(words)))
        random.shuffle(all_indexes)
        count = 0
        idx = 0
        while count < k:
            if words[all_indexes[idx]] not in punctuation_list and words[all_indexes[idx]] not in interregnum_list:
                indices.append(all_indexes[idx])
                count += 1
            idx += 1
        assert len(indices) == k
        # selected_words = [words[i] for i in indices]
        # print(f"selected_words:{selected_words}")
        rm_indices = indices[:k_rm]
        im_indices = indices[k_rm:]
        tokens = []
        labels = []
        for idx, word in enumerate(words):
            if word == ' ':
                continue
            if idx in rm_indices:
                if len(word) == 1:
                    options = [1, 2, 3]
                    weights = [0.4, 0.35, 0.25]
                # elif len(word) == 2:
                #     options = [1, 2, 3]
                #     weights = [0.4, 0.35, 0.25]
                else:
                    options = [1, 2]
                    weights = [0.5, 0.5]
                choices = random.choices(options, weights=weights, k=1)[0]
                if word.isalpha() and word.isascii():
                    new_word = [word] * choices
                    new_word = ' '.join(new_word)
                else:
                    new_word = word * choices
                # start = time.time()
                new_word = tokenizer.tokenize(new_word)
                # end = time.time()
                # print("tokenize time: %.2fs" % (end - start))
                tokens.extend(new_word)
                labels.append("B-RM")
                labels.extend(["I-RM"] * (len(new_word) - 1))
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.append("B-RP")
                labels.extend(["I-RP"] * (len(word) - 1))
                # print(f"words:{words}")
                # print(f"word:{word}")
                # print(tokens)
                # print(labels)
                assert len(tokens) == len(labels)
            elif idx in im_indices:
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.extend(["O"] * len(word))
                im_word = random.choice(interregnum_list)
                im_word = tokenizer.tokenize(im_word)
                tokens.extend(im_word)
                labels.append("B-IM")
                labels.extend(["I-IM"] * (len(im_word) - 1))
                assert len(tokens) == len(labels)
            else:
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.extend(["O"] * len(word))
                assert len(tokens) == len(labels)
        assert len(tokens) == len(labels)
        token_label = [token + "|" + label for token, label in zip(tokens, labels)]
        # print(token_label)
        return token_label
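

# A minimal, illustrative sketch of the output format (hypothetical helper, not used by
# the data pipeline): each element of the returned list is "token|label", where B-RM/I-RM
# mark the inserted (disfluent) copy, B-RP/I-RP the original word it repeats, and
# B-IM/I-IM an inserted filler word. The output is random, so the comment below is
# indicative only.
def _demo_create_repetitions():
    sample = create_repetitions("今天天气很好")
    # e.g. ['今|B-RM', '天|I-RM', '今|B-RP', '天|I-RP', '天|O', '气|O', '很|O', '嗯|B-IM', '好|O']
    print(sample)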


def create_csc(data_dir, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for split in ["train", "dev", "test"]:
        # for split in ["test"]:
        data = json.load(open(os.path.join(data_dir, f"{split}.json"), "r", encoding="utf-8"))
        data = data[:100]  # NOTE: only the first 100 samples of each split are used
        token_label_data = []
        for item in data:
            # sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的,"
            sentence = item["correct_text"]
            # sentence = process_text(sentence)
            if item["wrong_ids"] == []:
                token_label = [token + "|O" for token in sentence]
            else:
                # start = time.time()
                token_label = create_repetitions(sentence)
                # end = time.time()
                # print("create_repetitions time: %.2fs" % (end - start))
                if token_label is None:
                    continue
            token_label_data.append(token_label)
        random.shuffle(token_label_data)
        with open(os.path.join(output_dir, f"{split}.txt"), "w", encoding="utf-8") as f:
            for i in token_label_data:
                f.write(str(i) + "\n")


def create_lang_8(data_dir, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    data = open(os.path.join(data_dir, "NLPCC2018_GEC_TrainingData/data.train"), "r", encoding="utf-8").readlines()
    print(f"data size: {len(data)}")
    token_label_data = []
    for item in tqdm(data):
        sentence = item.split("\t")[-1]
        sentence = process_text(sentence)
        if len(item.split("\t")) == 3:
            token_label = [token + "|O" for token in sentence]
        else:
            token_label = create_repetitions(sentence)
            if token_label is None:
                continue
        token_label_data.append(token_label)
    # random.shuffle(token_label_data)
    # split_index = int(len(token_label_data) * 0.95)
    # train_data = token_label_data[:split_index]
    # dev_data = token_label_data[split_index:]
    with open(os.path.join(output_dir, "data.txt"), "w", encoding="utf-8") as f:
        for i in token_label_data:
            f.write(str(i) + "\n")
    # with open(os.path.join(output_dir, "dev.txt"), "w", encoding="utf-8") as f:
    #     for i in dev_data:
    #         f.write(str(i) + "\n")
    # test_dataset = open(os.path.join(data_dir, "TestData_Task2/source.txt"), "r", encoding="utf-8").readlines()
    # test_data = []
    # for sentence in test_dataset:
    #     sentence = process_text(sentence)
    #     token_label = create_repetitions(sentence)
    #     if token_label is None:
    #         continue
    #     test_data.append(token_label)
    # with open(os.path.join(output_dir, "test.txt"), "w", encoding="utf-8") as f:
    #     for i in test_data:
    #         f.write(str(i) + "\n")


# class DatasetThread(threading.Thread):  # subclass of threading.Thread
#     def __init__(self, threadID, name, counter):
#         threading.Thread.__init__(self)
#         self.threadID = threadID
#         self.name = name
#         self.counter = counter
#
#     def run(self):  # put the code to execute in run(); the thread runs run() once started
#         print("Starting:", self.name)
#         create_csc("./csc", "./data")
#         print("Exiting " + self.name)


if __name__ == "__main__":
    create_lang_8("./lang-8", "./pretrain_data")
    # create_csc("./csc", "./data")
    # Create two threads
    # try:
    #     thread.start_new_thread(print_time, ("Thread-1", 2,))
    #     thread.start_new_thread(print_time, ("Thread-2", 4,))
    # except:
    #     print("Error: unable to start thread")
    # while 1:
    #     pass
#!/bin/bash
OUTPUT_DIR='./output/finetune_output'
MODEL_PATH='./output/pretrain_output/model.pt'
CUDA_VISIBLE_DEVICES='2' python fintune.py \
--do_train \
--do_eval \
--model_path $MODEL_PATH \
--output_dir $OUTPUT_DIR \
--evaluate_after_epoch \
--per_gpu_train_batch_size 42 \
--per_gpu_eval_batch_size 42 \
--dropout_prob 0.1 \
--max_seq_length 256 \
--learning_rate 3e-5 \
--weight_decay 5e-5 \
--num_train_epochs 10 \
--seed 42
{
    "framework": "pytorch",
    "task": "punctuation",
    "model": {
        "type": "generic-punc",
        "punc_model_name": "punc.pb",
        "punc_model_config": {
            "type": "pytorch",
            "code_base": "funasr",
            "mode": "punc",
            "lang": "zh-cn",
            "batch_size": 1,
            "punc_config": "punc.yaml",
            "model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
        }
    },
    "pipeline": {
        "type": "punc-inference"
    }
}
import os
import logging
import torch
from torch import nn
from torch.utils.data.distributed import DistributedSampler
from torch.optim import Adam
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
import numpy as np
import random
import argparse
from tqdm import tqdm, trange

logger = logging.getLogger(__name__)


def collate_fn(batch):
    # Turn a list of per-sample dicts into a single dict of stacked tensors
    new_batch = {key: [] for key in batch[0].keys()}
    for b in batch:
        for key in new_batch:
            new_batch[key].append(torch.tensor(b[key]))
    for b in new_batch:
        new_batch[b] = torch.stack(new_batch[b])
    return new_batch


# Dataset definition
class CTDataset(Dataset):
    def __init__(self, config, tokenizer, max_length):
        self.data = open(config.train_data, "r", encoding="utf-8").readlines()
        self.texts, self.labels = self.read_data(self.data)
        self.label_list = config.label_list
        self.label2id = config.label2id
        self.id2label = config.id2label
        self.num_labels = config.num_labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_label_id = config.pad_label_id

    def read_data(self, data):
        texts, labels = [], []
        for sent in data:
            sent = eval(sent)
            text, label = [], []
            for token_label in sent:
                token, tag = token_label.split("|")
                text.append(token)
                label.append(tag)
            texts.append(text)
            labels.append(label)
        return texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        assert len(text) == len(label)
        token_ids, attention_mask = self.tokenizer(
            text,
            # padding='max_length',
            # truncation=True,
            max_length=self.max_length,
            is_split_into_words=True,
            # return_tensors='pt'
        )
        # Truncate the labels the same way the tokenizer truncates the tokens, and fill the
        # <s>/</s> positions (and the padding positions) with pad_label_id so that
        # label_ids stay aligned with token_ids.
        label = label[:self.max_length - 2]
        label_ids = [self.pad_label_id] + [self.label2id[i] for i in label] + [self.pad_label_id]
        label_ids.extend([self.pad_label_id] * (self.max_length - len(label_ids)))
        assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
        inputs = {
            "token_ids": token_ids,
            "attention_mask": attention_mask,
            "label_ids": label_ids
        }
        return inputs
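

# A minimal sketch, never called by the training code, of how CTDataset plugs into a
# DataLoader via collate_fn. The file paths, batch size and label list below are
# assumptions that mirror the defaults used in the main block at the bottom of this file.
def _build_debug_dataloader(data_file="./pretrain_data/data.txt", batch_size=8, max_length=128):
    config = CTConfig("config.json")
    config.label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
    config.label2id = {label: i for i, label in enumerate(config.label_list)}
    config.id2label = {i: label for i, label in enumerate(config.label_list)}
    config.num_labels = len(config.label_list)
    config.train_data = data_file
    tokenizer = Tokenizer("vocab.json")
    dataset = CTDataset(config, tokenizer, max_length)
    # Each batch is a dict of stacked tensors: token_ids, attention_mask, label_ids
    return DataLoader(dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)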


def train(args, model, train_dataset):
    train_batch_size = args.per_gpu_batch_size * max(1, args.n_gpu)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
    # Sampler: random for single-process training, distributed otherwise
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, sampler=train_sampler, batch_size=train_batch_size)
    # Initialize the optimizer (the learning-rate scheduler below is left disabled)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # total_steps = len(train_dataloader) // args.epochs
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    training_loss = 0
    global_step = 0
    model.zero_grad()
    epoch_iterator = trange(0, int(args.epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)
    for epoch in epoch_iterator:
        pbar = tqdm(train_dataloader, desc="Training", disable=args.local_rank not in [-1, 0])
        for batch in pbar:
            model.train()
            token_ids = batch['token_ids'].to(args.device)
            attention_mask = batch['attention_mask'].to(args.device)
            label_ids = batch['label_ids'].to(args.device)
            outputs = model(token_ids, attention_mask, label_ids)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # average the per-GPU losses returned by DataParallel
            epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch + 1, round(loss.item(), 6)))
            loss.backward()
            training_loss += loss.item()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # scheduler.step()
            optimizer.step()
            model.zero_grad()
            global_step += 1
        if args.local_rank in [-1, 0]:
            # Save a checkpoint at the end of every epoch
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save = model_to_save.ct_tranformer
            if not os.path.exists(args.output_dir):
                os.mkdir(args.output_dir)
            save_path = os.path.join(args.output_dir, f"epoch:{epoch+1}_loss:{training_loss / global_step}.pt")
            torch.save(model_to_save.state_dict(), save_path)
    if args.local_rank in [-1, 0]:
        # Save the final model
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save = model_to_save.ct_tranformer
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)
        save_path = os.path.join(args.output_dir, "model.pt")
        torch.save(model_to_save.state_dict(), save_path)
    logger.info(f"Training loss: {training_loss / global_step}")


# Set the random seed for reproducibility
def set_seed(args):
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def parse_args():
    parser = argparse.ArgumentParser()
    # DDP: local_rank is supplied from outside; torch.distributed.launch passes it
    # automatically in distributed mode (both spellings are accepted here).
    parser.add_argument("--local_rank", "--local-rank", default=-1, type=int)
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA even when it is available.")
    parser.add_argument("--seed", default=2024, type=int)
    parser.add_argument("--data_dir", default="./pretrain_data", type=str)
    parser.add_argument("--output_dir", default="./outputs/", type=str)
    parser.add_argument("--epochs", default=3, type=int)
    parser.add_argument("--lr", default=5e-5, type=float)
    parser.add_argument("--per_gpu_batch_size", default=8, type=int)
    parser.add_argument("--max_length", default=128, type=int)
    args = parser.parse_args()
    return args


# Main entry point
if __name__ == "__main__":
    args = parse_args()
    # Start from a clean output directory
    if os.path.exists(args.output_dir):
        os.system(f"rm -rf {args.output_dir}")
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    # Setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    # Set the random seed
    set_seed(args)
    config = CTConfig('config.json')
    # Load the tokenizer and the model
    tokenizer = Tokenizer('vocab.json')
    label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
    label2id = {j: i for i, j in enumerate(label_list)}
    id2label = {i: j for i, j in enumerate(label_list)}
    num_labels = len(label_list)
    config.label_list = label_list
    config.label2id = label2id
    config.id2label = id2label
    config.num_labels = num_labels
    model = CTTransformerForPreTraining(config)
    model.to(args.device)
    config.train_data = os.path.join(args.data_dir, 'data.txt')
    pretrain_dataset = CTDataset(config, tokenizer, args.max_length)
    train(args, model, pretrain_dataset)
CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--epochs 20 \
--per_gpu_batch_size 64 \
--lr 5e-5 \
--output_dir ./outputs \
--seed 2024
import json
import re
# import string
# zh_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# en_punct = string.punctuation
# punct_list = zh_punct + en_punct


class Tokenizer:
    def __init__(self, vocab_path):
        self.vocab = self.load_vocab(vocab_path)  # a list of tokens; index == token id
        # Precompute a token -> id map so lookups do not fall back to list.index()
        self.token2id = {token: idx for idx, token in enumerate(self.vocab)}
        self.sentence_start_token = '<s>'
        self.sentence_end_token = '</s>'
        self.padding_token = '<pad>'

    def load_vocab(self, vocab_path):
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        return vocab

    def ids2tokens(self, token_ids):
        return [self.vocab[token_id] for token_id in token_ids]

    def tokens2ids(self, tokens):
        return [self.token2id[token] for token in tokens]

    def tokenize(self, text):
        text = text.lower()
        # text = [char for char in text if char not in punct_list]
        # text = ''.join(text)
        # Match single Chinese characters, English words, whitespace, and any other character
        pattern = r'[\u4e00-\u9fa5]|[a-zA-Z]+|\s|[^a-zA-Z\s]'
        tokens = re.findall(pattern, text)
        # Keep in-vocabulary tokens (English words, Chinese characters, other symbols) as-is;
        # map out-of-vocabulary tokens to <unk> and drop whitespace.
        tokenize_result = []
        for token in tokens:
            if token in self.token2id:
                tokenize_result.append(token)
            elif not token.isspace():
                tokenize_result.append('<unk>')
        return tokenize_result

    def __call__(self, text, max_length=None, is_split_into_words=None):
        if is_split_into_words:
            tokens = text
        else:
            tokens = self.tokenize(text)
        if len(tokens) > max_length - 2:
            tokens = tokens[:max_length - 2]  # truncate, leaving room for <s> and </s>
        tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token]
        attention_mask = [1] * len(tokens)
        # Pad up to max_length
        padding_length = max_length - len(tokens)
        tokens.extend([self.padding_token] * padding_length)
        attention_mask.extend([0] * padding_length)
        token_ids = self.tokens2ids(tokens)
        assert len(token_ids) == len(attention_mask)
        return token_ids, attention_mask


if __name__ == '__main__':
    # Small self-test: the tokenizer object is callable and returns (token_ids, attention_mask)
    tokenizer = Tokenizer('vocab.json')
    print(len(tokenizer.vocab))
    text = "Hello, 你好!This is a test."
    token_ids, attention_mask = tokenizer(text, max_length=32)
    print(token_ids)
    print(attention_mask)