Commit 29a4e564 by tongtao.ling

Initial commit

__pycache__/
dataset/
output/
pretrain_data/
{
    "vocab_size": 272833,
    "d_model": 256,
    "num_attention_heads": 8,
    "max_position_embeddings": 128,
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "layer_norm_eps": 1e-12,
    "pad_token_id": 0,
    "pad_label_id": -100,
    "intermediate_size": 1024,
    "num_hidden_layers": 4,
    "L_N": 9
}
import json


class CTConfig:
    def __init__(self, json_file):
        with open(json_file, 'r') as file:
            self._config_data = json.load(file)
        # Expose every entry of the JSON dict as an object attribute
        for key, value in self._config_data.items():
            setattr(self, key, value)

    def __getattr__(self, item):
        if item in self._config_data:
            return self._config_data[item]
        raise AttributeError(f"'Config' object has no attribute '{item}'")
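

if __name__ == "__main__":
    # A minimal usage sketch (assuming a config.json like the one in this repo sits next
    # to this file): every key in the JSON becomes an attribute, so the fields printed
    # below are just examples.
    config = CTConfig("config.json")
    print(config.vocab_size, config.d_model, config.num_hidden_layers)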
import random
import json
import jieba
import re
import time
import os
import pandas as pd
import string
import threading
from tqdm import tqdm
from tokenizer import Tokenizer

chinese_punctuation = ",。!?;:()【】《》“”‘’、"
punctuation_list = string.punctuation + chinese_punctuation
interregnum_list = ['吗', '吧', '呀', '呃', '呐', '呗', '呢', '呵', '哇', '哉', '哎', '哩', '唔', '唸', '啦', '啵', '嘛', '嘞', '欤', '啊', '哦', '恩', '嗯']
# interregnum_list = [token for token, pos in dt.word_tag_tab.items() if pos == "y"]
# interregnum_list.extend(["啊","哦","哎","恩","嗯"])
# interregnum_list.remove("吧")

# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = Tokenizer("vocab.json")


def process_text(sent):
    sent = sent.replace("\n", "").replace("\\n", "").replace("|", "")
    sent = sent.replace("\u3000", "").replace("\u2006", "").replace("\u200e", "").replace("\u200b", "").replace(u"\xa0", "").replace("\uf04a", "").replace(u"\xad", "").replace(u"\ufeff", "")
    sent = remove_special_chars(sent)
    return sent


def remove_special_chars(input_string):
    # Keep word characters, whitespace and CJK characters; drop everything else
    return re.sub(r'[^\w\s\u4e00-\u9fa5]+', '', input_string)


def create_repetitions(sentence):
    k_rm = random.randint(1, 3)
    k_im = random.randint(1, 2)
    k = k_rm + k_im
    if not sentence:
        return None
        # raise TypeError(
        #     """A 'NoneType' object received while a 'str' object is required."""
        # )
    else:
        words = jieba.lcut(sentence)
        words = list(filter(None, words))
        # words = [word for word in words if word not in [' ', '']]
        # assert '' not in words and ' ' not in words
        # words = tokenizer.tokenize(sentence)
        words_without_rm_im = [word for word in words if word not in punctuation_list and word not in interregnum_list]
        if len(words_without_rm_im) < k:
            return None
        indices = []
        all_indexes = list(range(len(words)))
        random.shuffle(all_indexes)
        count = 0
        idx = 0
        while count < k:
            if words[all_indexes[idx]] not in punctuation_list and words[all_indexes[idx]] not in interregnum_list:
                indices.append(all_indexes[idx])
                count += 1
            idx += 1
        assert len(indices) == k
        # selected_words = [words[i] for i in indices]
        # print(f"selected_words:{selected_words}")
        rm_indices = indices[:k_rm]
        im_indices = indices[k_rm:]
        tokens = []
        labels = []
        for idx, word in enumerate(words):
            if word == ' ':
                continue
            if idx in rm_indices:
                if len(word) == 1:
                    options = [1, 2, 3]
                    weights = [0.4, 0.35, 0.25]
                # elif len(word) == 2:
                #     options = [1, 2, 3]
                #     weights = [0.4, 0.35, 0.25]
                else:
                    options = [1, 2]
                    weights = [0.5, 0.5]
                choices = random.choices(options, weights=weights, k=1)[0]
                if word.isalpha() and word.isascii():
                    new_word = [word] * choices
                    new_word = ' '.join(new_word)
                else:
                    new_word = word * choices
                # start = time.time()
                new_word = tokenizer.tokenize(new_word)
                # end = time.time()
                # print("tokenize time: %.2fs" % (end - start))
                tokens.extend(new_word)
                labels.append("B-RM")
                labels.extend(["I-RM"] * (len(new_word) - 1))
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.append("B-RP")
                labels.extend(["I-RP"] * (len(word) - 1))
                # print(f"words:{words}")
                # print(f"word:{word}")
                # print(tokens)
                # print(labels)
                assert len(tokens) == len(labels)
            elif idx in im_indices:
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.extend(["O"] * len(word))
                im_word = random.choice(interregnum_list)
                im_word = tokenizer.tokenize(im_word)
                tokens.extend(im_word)
                labels.append("B-IM")
                labels.extend(["I-IM"] * (len(im_word) - 1))
                assert len(tokens) == len(labels)
            else:
                word = tokenizer.tokenize(word)
                tokens.extend(word)
                labels.extend(["O"] * len(word))
                assert len(tokens) == len(labels)
        assert len(tokens) == len(labels)
        token_label = [token + "|" + label for token, label in zip(tokens, labels)]
        # print(token_label)
        return token_label
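

# A minimal, illustrative sketch of the output format (hypothetical helper, not used by
# the data pipeline): each element of the returned list is "token|label", where B-RM/I-RM
# mark the inserted (disfluent) copy, B-RP/I-RP the original word it repeats, and
# B-IM/I-IM an inserted filler word. The output is random, so the comment below is
# indicative only.
def _demo_create_repetitions():
    sample = create_repetitions("今天天气很好")
    # e.g. ['今|B-RM', '天|I-RM', '今|B-RP', '天|I-RP', '天|O', '气|O', '很|O', '嗯|B-IM', '好|O']
    print(sample)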


def create_csc(data_dir, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for split in ["train", "dev", "test"]:
        # for split in ["test"]:
        data = json.load(open(os.path.join(data_dir, f"{split}.json"), "r", encoding="utf-8"))
        data = data[:100]  # NOTE: only the first 100 samples of each split are used
        token_label_data = []
        for item in data:
            # sentence = "地处武林路,相当繁华的地带啊,内部环境不错,很干净,地上的头发会当即清扫。发型师的服务态度也很好,当然价格是比较高的,"
            sentence = item["correct_text"]
            # sentence = process_text(sentence)
            if item["wrong_ids"] == []:
                token_label = [token + "|O" for token in sentence]
            else:
                # start = time.time()
                token_label = create_repetitions(sentence)
                # end = time.time()
                # print("create_repetitions time: %.2fs" % (end - start))
                if token_label is None:
                    continue
            token_label_data.append(token_label)
        random.shuffle(token_label_data)
        with open(os.path.join(output_dir, f"{split}.txt"), "w", encoding="utf-8") as f:
            for i in token_label_data:
                f.write(str(i) + "\n")


def create_lang_8(data_dir, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    data = open(os.path.join(data_dir, "NLPCC2018_GEC_TrainingData/data.train"), "r", encoding="utf-8").readlines()
    print(f"data size: {len(data)}")
    token_label_data = []
    for item in tqdm(data):
        sentence = item.split("\t")[-1]
        sentence = process_text(sentence)
        if len(item.split("\t")) == 3:
            token_label = [token + "|O" for token in sentence]
        else:
            token_label = create_repetitions(sentence)
            if token_label is None:
                continue
        token_label_data.append(token_label)
    # random.shuffle(token_label_data)
    # split_index = int(len(token_label_data) * 0.95)
    # train_data = token_label_data[:split_index]
    # dev_data = token_label_data[split_index:]
    with open(os.path.join(output_dir, "data.txt"), "w", encoding="utf-8") as f:
        for i in token_label_data:
            f.write(str(i) + "\n")
    # with open(os.path.join(output_dir, "dev.txt"), "w", encoding="utf-8") as f:
    #     for i in dev_data:
    #         f.write(str(i) + "\n")
    # test_dataset = open(os.path.join(data_dir, "TestData_Task2/source.txt"), "r", encoding="utf-8").readlines()
    # test_data = []
    # for sentence in test_dataset:
    #     sentence = process_text(sentence)
    #     token_label = create_repetitions(sentence)
    #     if token_label is None:
    #         continue
    #     test_data.append(token_label)
    # with open(os.path.join(output_dir, "test.txt"), "w", encoding="utf-8") as f:
    #     for i in test_data:
    #         f.write(str(i) + "\n")


# class DatasetThread(threading.Thread):  # subclass of threading.Thread
#     def __init__(self, threadID, name, counter):
#         threading.Thread.__init__(self)
#         self.threadID = threadID
#         self.name = name
#         self.counter = counter
#
#     def run(self):  # put the code to execute in run(); the thread runs run() once started
#         print("Starting:", self.name)
#         create_csc("./csc", "./data")
#         print("Exiting " + self.name)


if __name__ == "__main__":
    create_lang_8("./lang-8", "./pretrain_data")
    # create_csc("./csc", "./data")
    # Create two threads
    # try:
    #     thread.start_new_thread(print_time, ("Thread-1", 2,))
    #     thread.start_new_thread(print_time, ("Thread-2", 4,))
    # except:
    #     print("Error: unable to start thread")
    # while 1:
    #     pass
#!/bin/bash
OUTPUT_DIR='./output/finetune_output'
MODEL_PATH='./output/pretrain_output/model.pt'
CUDA_VISIBLE_DEVICES='2' python fintune.py \
--do_train \
--do_eval \
--model_path $MODEL_PATH \
--output_dir $OUTPUT_DIR \
--evaluate_after_epoch \
--per_gpu_train_batch_size 42 \
--per_gpu_eval_batch_size 42 \
--dropout_prob 0.1 \
--max_seq_length 256 \
--learning_rate 3e-5 \
--weight_decay 5e-5 \
--num_train_epochs 10 \
--seed 42
{
    "framework": "pytorch",
    "task": "punctuation",
    "model": {
        "type": "generic-punc",
        "punc_model_name": "punc.pb",
        "punc_model_config": {
            "type": "pytorch",
            "code_base": "funasr",
            "mode": "punc",
            "lang": "zh-cn",
            "batch_size": 1,
            "punc_config": "punc.yaml",
            "model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
        }
    },
    "pipeline": {
        "type": "punc-inference"
    }
}
import os
import logging
import torch
from torch import nn
from torch.utils.data.distributed import DistributedSampler
from torch.optim import Adam
import pandas as pd
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tokenizer import Tokenizer
from config import CTConfig
from model import CTTransformerForPreTraining
import numpy as np
import random
import argparse
from tqdm import tqdm, trange

logger = logging.getLogger(__name__)


def collate_fn(batch):
    # Turn a list of per-sample dicts into a single dict of stacked tensors
    new_batch = {key: [] for key in batch[0].keys()}
    for b in batch:
        for key in new_batch:
            new_batch[key].append(torch.tensor(b[key]))
    for b in new_batch:
        new_batch[b] = torch.stack(new_batch[b])
    return new_batch


# Dataset definition
class CTDataset(Dataset):
    def __init__(self, config, tokenizer, max_length):
        self.data = open(config.train_data, "r", encoding="utf-8").readlines()
        self.texts, self.labels = self.read_data(self.data)
        self.label_list = config.label_list
        self.label2id = config.label2id
        self.id2label = config.id2label
        self.num_labels = config.num_labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_label_id = config.pad_label_id

    def read_data(self, data):
        texts, labels = [], []
        for sent in data:
            sent = eval(sent)
            text, label = [], []
            for token_label in sent:
                token, tag = token_label.split("|")
                text.append(token)
                label.append(tag)
            texts.append(text)
            labels.append(label)
        return texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        assert len(text) == len(label)
        token_ids, attention_mask = self.tokenizer(
            text,
            # padding='max_length',
            # truncation=True,
            max_length=self.max_length,
            is_split_into_words=True,
            # return_tensors='pt'
        )
        # Truncate the labels the same way the tokenizer truncates the tokens, and fill the
        # <s>/</s> positions (and the padding positions) with pad_label_id so that
        # label_ids stay aligned with token_ids.
        label = label[:self.max_length - 2]
        label_ids = [self.pad_label_id] + [self.label2id[i] for i in label] + [self.pad_label_id]
        label_ids.extend([self.pad_label_id] * (self.max_length - len(label_ids)))
        assert len(token_ids) == len(attention_mask) == len(label_ids) == self.max_length
        inputs = {
            "token_ids": token_ids,
            "attention_mask": attention_mask,
            "label_ids": label_ids
        }
        return inputs
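

# A minimal sketch, never called by the training code, of how CTDataset plugs into a
# DataLoader via collate_fn. The file paths, batch size and label list below are
# assumptions that mirror the defaults used in the main block at the bottom of this file.
def _build_debug_dataloader(data_file="./pretrain_data/data.txt", batch_size=8, max_length=128):
    config = CTConfig("config.json")
    config.label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
    config.label2id = {label: i for i, label in enumerate(config.label_list)}
    config.id2label = {i: label for i, label in enumerate(config.label_list)}
    config.num_labels = len(config.label_list)
    config.train_data = data_file
    tokenizer = Tokenizer("vocab.json")
    dataset = CTDataset(config, tokenizer, max_length)
    # Each batch is a dict of stacked tensors: token_ids, attention_mask, label_ids
    return DataLoader(dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)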


def train(args, model, train_dataset):
    train_batch_size = args.per_gpu_batch_size * max(1, args.n_gpu)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=False)
    # Sampler: random for single-process training, distributed otherwise
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, sampler=train_sampler, batch_size=train_batch_size)
    # Initialize the optimizer (the learning-rate scheduler below is left disabled)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # total_steps = len(train_dataloader) // args.epochs
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    training_loss = 0
    global_step = 0
    model.zero_grad()
    epoch_iterator = trange(0, int(args.epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)
    for epoch in epoch_iterator:
        pbar = tqdm(train_dataloader, desc="Training", disable=args.local_rank not in [-1, 0])
        for batch in pbar:
            model.train()
            token_ids = batch['token_ids'].to(args.device)
            attention_mask = batch['attention_mask'].to(args.device)
            label_ids = batch['label_ids'].to(args.device)
            outputs = model(token_ids, attention_mask, label_ids)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # average the per-GPU losses returned by DataParallel
            epoch_iterator.set_description('Epoch: {}, Loss: {}'.format(epoch + 1, round(loss.item(), 6)))
            loss.backward()
            training_loss += loss.item()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # scheduler.step()
            optimizer.step()
            model.zero_grad()
            global_step += 1
        if args.local_rank in [-1, 0]:
            # Save a checkpoint at the end of every epoch
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save = model_to_save.ct_tranformer
            if not os.path.exists(args.output_dir):
                os.mkdir(args.output_dir)
            save_path = os.path.join(args.output_dir, f"epoch:{epoch+1}_loss:{training_loss / global_step}.pt")
            torch.save(model_to_save.state_dict(), save_path)
    if args.local_rank in [-1, 0]:
        # Save the final model
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save = model_to_save.ct_tranformer
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)
        save_path = os.path.join(args.output_dir, "model.pt")
        torch.save(model_to_save.state_dict(), save_path)
    logger.info(f"Training loss: {training_loss / global_step}")


# Set the random seed for reproducibility
def set_seed(args):
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def parse_args():
    parser = argparse.ArgumentParser()
    # DDP: local_rank is supplied from outside; torch.distributed.launch passes it
    # automatically in distributed mode (both spellings are accepted here).
    parser.add_argument("--local_rank", "--local-rank", default=-1, type=int)
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA even when it is available.")
    parser.add_argument("--seed", default=2024, type=int)
    parser.add_argument("--data_dir", default="./pretrain_data", type=str)
    parser.add_argument("--output_dir", default="./outputs/", type=str)
    parser.add_argument("--epochs", default=3, type=int)
    parser.add_argument("--lr", default=5e-5, type=float)
    parser.add_argument("--per_gpu_batch_size", default=8, type=int)
    parser.add_argument("--max_length", default=128, type=int)
    args = parser.parse_args()
    return args


# Main entry point
if __name__ == "__main__":
    args = parse_args()
    # Start from a clean output directory
    if os.path.exists(args.output_dir):
        os.system(f"rm -rf {args.output_dir}")
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    # Setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    # Set the random seed
    set_seed(args)
    config = CTConfig('config.json')
    # Load the tokenizer and the model
    tokenizer = Tokenizer('vocab.json')
    label_list = ["O", "B-IM", "I-IM", "B-RM", "I-RM", "B-RP", "I-RP"]
    label2id = {j: i for i, j in enumerate(label_list)}
    id2label = {i: j for i, j in enumerate(label_list)}
    num_labels = len(label_list)
    config.label_list = label_list
    config.label2id = label2id
    config.id2label = id2label
    config.num_labels = num_labels
    model = CTTransformerForPreTraining(config)
    model.to(args.device)
    config.train_data = os.path.join(args.data_dir, 'data.txt')
    pretrain_dataset = CTDataset(config, tokenizer, args.max_length)
    train(args, model, pretrain_dataset)
CUDA_VISIBLE_DEVICES="4,5,6,7" python -m torch.distributed.launch --nproc_per_node=4 pretrain.py \
--epochs 20 \
--per_gpu_batch_size 64 \
--lr 5e-5 \
--output_dir ./outputs \
--seed 2024
import json
import re
# import string
# zh_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
# en_punct = string.punctuation
# punct_list = zh_punct + en_punct


class Tokenizer:
    def __init__(self, vocab_path):
        self.vocab = self.load_vocab(vocab_path)  # a list of tokens; index == token id
        # Precompute a token -> id map so lookups do not fall back to list.index()
        self.token2id = {token: idx for idx, token in enumerate(self.vocab)}
        self.sentence_start_token = '<s>'
        self.sentence_end_token = '</s>'
        self.padding_token = '<pad>'

    def load_vocab(self, vocab_path):
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        return vocab

    def ids2tokens(self, token_ids):
        return [self.vocab[token_id] for token_id in token_ids]

    def tokens2ids(self, tokens):
        return [self.token2id[token] for token in tokens]

    def tokenize(self, text):
        text = text.lower()
        # text = [char for char in text if char not in punct_list]
        # text = ''.join(text)
        # Match single Chinese characters, English words, whitespace, and any other character
        pattern = r'[\u4e00-\u9fa5]|[a-zA-Z]+|\s|[^a-zA-Z\s]'
        tokens = re.findall(pattern, text)
        # Keep in-vocabulary tokens (English words, Chinese characters, other symbols) as-is;
        # map out-of-vocabulary tokens to <unk> and drop whitespace.
        tokenize_result = []
        for token in tokens:
            if token in self.token2id:
                tokenize_result.append(token)
            elif not token.isspace():
                tokenize_result.append('<unk>')
        return tokenize_result

    def __call__(self, text, max_length=None, is_split_into_words=None):
        if is_split_into_words:
            tokens = text
        else:
            tokens = self.tokenize(text)
        if len(tokens) > max_length - 2:
            tokens = tokens[:max_length - 2]  # truncate, leaving room for <s> and </s>
        tokens = [self.sentence_start_token] + tokens + [self.sentence_end_token]
        attention_mask = [1] * len(tokens)
        # Pad up to max_length
        padding_length = max_length - len(tokens)
        tokens.extend([self.padding_token] * padding_length)
        attention_mask.extend([0] * padding_length)
        token_ids = self.tokens2ids(tokens)
        assert len(token_ids) == len(attention_mask)
        return token_ids, attention_mask


if __name__ == '__main__':
    # Small self-test: the tokenizer object is callable and returns (token_ids, attention_mask)
    tokenizer = Tokenizer('vocab.json')
    print(len(tokenizer.vocab))
    text = "Hello, 你好!This is a test."
    token_ids, attention_mask = tokenizer(text, max_length=32)
    print(token_ids)
    print(attention_mask)