[Paper Review] NLI Classification with BERT
Library and TPU Setup
!pip install transformers
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn import preprocessing
import pandas as pd
from tqdm import tqdm
import urllib.request
import torch_xla
import torch_xla.core.xla_model as xm
Loading the Data
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/multinli.train.ko.tsv", filename="multinli.train.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/snli_1.0_train.ko.tsv", filename="snli_1.0_train.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.dev.ko.tsv", filename="xnli.dev.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.test.ko.tsv", filename="xnli.test.ko.tsv")
train_snli = pd.read_csv("snli_1.0_train.ko.tsv", sep='\t', quoting=3)
train_mnli = pd.read_csv("multinli.train.ko.tsv", sep='\t', quoting=3)
val_data = pd.read_csv("xnli.dev.ko.tsv", sep='\t', quoting=3)
test_data = pd.read_csv("xnli.test.ko.tsv", sep='\t', quoting=3)
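A quick sanity check helps here, since the rest of the code relies on the KorNLI column names sentence1, sentence2, and gold_label:

# Inspect the loaded data; downstream code assumes the columns
# sentence1, sentence2, and gold_label.
print(train_snli.head())
print(train_snli["gold_label"].value_counts())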
Data Preprocessing
train_data = pd.concat([train_snli, train_mnli])  # DataFrame.append is deprecated
train_data = train_data.sample(frac=1)  # shuffle the combined training set
def drop_na_and_duplicates(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

train_data = drop_na_and_duplicates(train_data)
val_data = drop_na_and_duplicates(val_data)
test_data = drop_na_and_duplicates(test_data)
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")
max_seq_len = 128
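Before batching everything, it is worth inspecting what one encoded pair looks like. A minimal sketch with a made-up sentence pair (klue/bert-base follows the usual BERT layout of [CLS] premise [SEP] hypothesis [SEP]):

# Hypothetical example pair, just to inspect the encoding layout.
sample = tokenizer.encode_plus("저는 학교에 갑니다.", "저는 밖에 있습니다.",
                               max_length=max_seq_len, padding="max_length", truncation=True)
print(tokenizer.convert_ids_to_tokens(sample["input_ids"])[:10])  # [CLS], premise tokens, [SEP], then hypothesis
print(sample["token_type_ids"][:10])  # 0 for the first segment, 1 for the second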
def encode_sentence_pairs(sent_list1, sent_list2, max_seq_len, tokenizer):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for sent1, sent2 in tqdm(zip(sent_list1, sent_list2), total=len(sent_list1)):
        # Encode the premise/hypothesis pair as [CLS] sent1 [SEP] sent2 [SEP],
        # padded and truncated to max_seq_len (pad_to_max_length is deprecated).
        encoding_result = tokenizer.encode_plus(sent1, sent2, max_length=max_seq_len,
                                                padding="max_length", truncation=True)
        input_ids.append(encoding_result["input_ids"])
        attention_masks.append(encoding_result["attention_mask"])
        token_type_ids.append(encoding_result["token_type_ids"])
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    token_type_ids = torch.tensor(token_type_ids)
    return (input_ids, attention_masks, token_type_ids)
X_train = encode_sentence_pairs(train_data["sentence1"], train_data["sentence2"], max_seq_len=max_seq_len, tokenizer=tokenizer)
X_val = encode_sentence_pairs(val_data["sentence1"], val_data["sentence2"], max_seq_len=max_seq_len, tokenizer=tokenizer)
X_test = encode_sentence_pairs(test_data["sentence1"], test_data["sentence2"], max_seq_len=max_seq_len, tokenizer=tokenizer)
train_label = train_data["gold_label"].tolist()
val_label = val_data["gold_label"].tolist()
test_label = test_data["gold_label"].tolist()
idx_encode = preprocessing.LabelEncoder()
idx_encode.fit(train_label)
y_train = idx_encode.transform(train_label)
y_val = idx_encode.transform(val_label)
y_test = idx_encode.transform(test_label)
label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
idx_label = {value: key for key, value in label_idx.items()}
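Since LabelEncoder sorts classes alphabetically, the KorNLI labels should map as follows (this is what justifies num_labels=3 below):

print(label_idx)  # expected: {'contradiction': 0, 'entailment': 1, 'neutral': 2}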
Model Implementation
device = xm.xla_device()  # expose a single TPU core as a torch device
model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()  # unused below: the model computes its own loss when labels are passed
input_ids_train = X_train[0]
attention_masks_train = X_train[1]
token_type_ids_train = X_train[2]
input_ids_val = X_val[0]
attention_masks_val = X_val[1]
token_type_ids_val = X_val[2]
input_ids_test = X_test[0]
attention_masks_test = X_test[1]
token_type_ids_test = X_test[2]
y_train = torch.tensor(y_train)
y_val = torch.tensor(y_val)
y_test = torch.tensor(y_test)
train_tensor = TensorDataset(input_ids_train,attention_masks_train,token_type_ids_train,y_train)
val_tensor = TensorDataset(input_ids_val,attention_masks_val,token_type_ids_val,y_val)
test_tensor = TensorDataset(input_ids_test,attention_masks_test,token_type_ids_test,y_test)
train_loader = DataLoader(train_tensor,batch_size=128,shuffle=True,drop_last=True)
val_loader = DataLoader(val_tensor,batch_size=128,shuffle=False)
test_loader = DataLoader(test_tensor,batch_size=128,shuffle=False)
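A quick shape check on one batch; the expected shapes follow from batch_size=128 and max_seq_len=128:

batch = next(iter(train_loader))
print([t.shape for t in batch])  # expected: three [128, 128] tensors plus one [128] label tensor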
for epoch in range(2):
    avg_loss = 0
    for batch in train_loader:
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        token_type_id = batch[2].to(device)
        target = batch[3].to(device)
        # When labels are passed, the first output of the model is the loss.
        outputs = model(input_id, attention_mask, token_type_ids=token_type_id, labels=target)
        loss = outputs[0]
        optimizer.zero_grad()
        loss.backward()
        xm.optimizer_step(optimizer, barrier=True)  # steps the optimizer and syncs the TPU
        avg_loss += loss.item() / len(train_loader)
    print("Epoch {} | average loss: {}".format(epoch + 1, avg_loss))
def test():
    model.eval()
    corrected = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for batch in test_loader:
            input_id = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_id = batch[2].to(device)
            target = batch[3].to(device)
            # Without labels, the first output is the logits.
            logits = model(input_id, attention_mask, token_type_ids=token_type_id)[0]
            corrected += (logits.cpu().argmax(dim=-1) == target.cpu()).sum().item()
    print("Test accuracy: {:.2f}%".format(corrected / len(test_loader.dataset) * 100))

test()
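Finally, a sketch of single-pair inference with the trained model and the idx_label mapping; the premise/hypothesis strings are made-up examples, not from the dataset:

def predict(premise, hypothesis):
    model.eval()
    enc = tokenizer.encode_plus(premise, hypothesis, max_length=max_seq_len,
                                padding="max_length", truncation=True, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc)[0]
    return idx_label[logits.cpu().argmax(dim=-1).item()]

# Illustrative contradicting pair.
print(predict("저는 학교에 갑니다.", "저는 집에 있습니다."))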