Libraries and TPU Setup

!pip install transformers
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
import torch

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification

from sklearn.preprocessing import LabelEncoder
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import torch_xla
import torch_xla.core.xla_model as xm
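
A quick sanity check that the TPU runtime is actually visible before going any further (a minimal sketch; both calls are standard torch_xla APIs):

print(torch_xla.__version__)  # e.g. 1.9
print(xm.xla_device())        # should print an XLA device such as xla:1 if the TPU runtime is attached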

Loading the Data

urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/multinli.train.ko.tsv", filename="multinli.train.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/snli_1.0_train.ko.tsv", filename="snli_1.0_train.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.dev.ko.tsv", filename="xnli.dev.ko.tsv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.test.ko.tsv", filename="xnli.test.ko.tsv")

train_snli = pd.read_csv("snli_1.0_train.ko.tsv", sep='\t', quoting=3)
train_xnli = pd.read_csv("multinli.train.ko.tsv", sep='\t', quoting=3)
val_data = pd.read_csv("xnli.dev.ko.tsv", sep='\t', quoting=3)
test_data = pd.read_csv("xnli.test.ko.tsv", sep='\t', quoting=3)
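
A quick look at the loaded frames before preprocessing (purely a sanity check; the column names follow the KorNLI TSV format):

print(len(train_snli), len(train_xnli), len(val_data), len(test_data))
print(train_snli.columns.tolist())  # expected: ['sentence1', 'sentence2', 'gold_label']
train_snli.head()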

Data Preprocessing

train_data = pd.concat([train_snli, train_xnli])  # DataFrame.append was removed in pandas 2.x
train_data = train_data.sample(frac=1)            # shuffle the combined training data

def drop_na_and_duplicates(df):
  df = df.dropna()
  df = df.drop_duplicates()
  df = df.reset_index(drop=True)
  return df

train_data = drop_na_and_duplicates(train_data)
val_data = drop_na_and_duplicates(val_data)
test_data = drop_na_and_duplicates(test_data)

tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

max_seq_len = 128

def Preprocessing(sent_list1,sent_list2,max_seq_len,tokenizer):

  input_ids = []
  attention_masks = []
  token_type_ids = []

  for sent1, sent2 in tqdm(zip(sent_list1, sent_list2), total=len(sent_list1)):
    # Encode each pair as [CLS] sent1 [SEP] sent2 [SEP], padded/truncated to max_seq_len
    encoding_result = tokenizer.encode_plus(sent1, sent2, max_length=max_seq_len,
                                            padding="max_length", truncation=True)

    input_ids.append(encoding_result["input_ids"])
    attention_masks.append(encoding_result["attention_mask"])
    token_type_ids.append(encoding_result["token_type_ids"])

  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)
  token_type_ids = torch.tensor(token_type_ids)

  return (input_ids, attention_masks, token_type_ids)

X_train = Preprocessing(train_data["sentence1"],train_data["sentence2"],max_seq_len=max_seq_len,tokenizer=tokenizer)
X_val = Preprocessing(val_data["sentence1"],val_data["sentence2"],max_seq_len=max_seq_len,tokenizer=tokenizer)
X_test = Preprocessing(test_data["sentence1"],test_data["sentence2"],max_seq_len=max_seq_len,tokenizer=tokenizer)
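
As a quick check of what Preprocessing produced, the tensor shapes and a decoded first example can be inspected (illustrative only; the decoded sentence depends on the shuffle above):

print(X_train[0].shape, X_train[1].shape, X_train[2].shape)  # each is (num_train_examples, max_seq_len)
print(tokenizer.decode(X_train[0][0]))  # [CLS] sentence1 [SEP] sentence2 [SEP] ... [PAD] ...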

train_label = train_data["gold_label"].tolist()
val_label = val_data["gold_label"].tolist()
test_label = test_data["gold_label"].tolist()

idx_encode = preprocessing.LabelEncoder()
idx_encode.fit(train_label)

y_train = idx_encode.transform(train_label)
y_val = idx_encode.transform(val_label)
y_test = idx_encode.transform(test_label)

label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
idx_label = {value: key for key, value in label_idx.items()}
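
Since LabelEncoder assigns indices to the sorted class names, the three KorNLI gold labels should map as follows (worth verifying against your own run):

print(label_idx)  # expected: {'contradiction': 0, 'entailment': 1, 'neutral': 2}
print(idx_label)  # expected: {0: 'contradiction', 1: 'entailment', 2: 'neutral'}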

Model Implementation

device = xm.xla_device()
model = BertForSequenceClassification.from_pretrained("klue/bert-base",num_labels=3).to(device)
optimizer = optim.Adam(model.parameters(),lr=5e-5)
criterion = nn.CrossEntropyLoss()  # not actually used below: BertForSequenceClassification computes the loss internally when labels are passed
input_ids_train = X_train[0]
attention_masks_train = X_train[1]
token_type_ids_train = X_train[2]

input_ids_val = X_val[0]
attention_masks_val = X_val[1]
token_type_ids_val = X_val[2]

input_ids_test = X_test[0]
attention_masks_test = X_test[1]
token_type_ids_test = X_test[2]

y_train = torch.tensor(y_train)
y_val = torch.tensor(y_val)
y_test = torch.tensor(y_test)

train_tensor = TensorDataset(input_ids_train,attention_masks_train,token_type_ids_train,y_train)
val_tensor = TensorDataset(input_ids_val,attention_masks_val,token_type_ids_val,y_val)
test_tensor = TensorDataset(input_ids_test,attention_masks_test,token_type_ids_test,y_test)

train_loader = DataLoader(train_tensor,batch_size=128,shuffle=True,drop_last=True)
val_loader = DataLoader(val_tensor,batch_size=128,shuffle=False)
test_loader = DataLoader(test_tensor,batch_size=128,shuffle=False)
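
One batch can be pulled from the loader to confirm the tensor shapes the model will receive (a sanity check only):

batch = next(iter(train_loader))
print([t.shape for t in batch])  # three (128, 128) tensors plus a (128,) label tensor, given batch_size=128 and max_seq_len=128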

for epoch in range(2):
  avg_loss = 0
  for batch in train_loader:
    input_id = batch[0].to(device)
    attention_mask = batch[1].to(device)
    token_type_id = batch[2].to(device)
    target = batch[3].to(device)

    outputs = model(input_id, attention_mask, token_type_ids=token_type_id, labels=target)
    loss = outputs[0]  # with labels passed, the first element of the output is the loss

    optimizer.zero_grad()
    loss.backward()
    xm.optimizer_step(optimizer,barrier=True)

    avg_loss += loss / len(train_loader)

  print("epoch : {} 일때 loss : {}".format(epoch+1,avg_loss))

def test():
  model.eval()
  corrected = 0

  with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_loader:
      input_id = batch[0].to(device)
      attention_mask = batch[1].to(device)
      token_type_id = batch[2].to(device)
      target = batch[3].to(device)

      outputs = model(input_id, attention_mask, token_type_ids=token_type_id)
      logits = outputs[0]  # without labels, the first element of the output is the logits
      corrected += (logits.cpu().argmax(dim=-1) == target.cpu()).sum().item()

  print("Test data accuracy: {:.2f}%".format(corrected / len(test_loader.dataset) * 100))

test()
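
Finally, a sketch of how the fine-tuned model could be applied to a new premise/hypothesis pair; the predict helper and the example sentences are illustrative additions, not part of the original code:

def predict(premise, hypothesis):
  model.eval()
  enc = tokenizer.encode_plus(premise, hypothesis, max_length=max_seq_len,
                              padding="max_length", truncation=True, return_tensors="pt")
  with torch.no_grad():
    outputs = model(enc["input_ids"].to(device),
                    enc["attention_mask"].to(device),
                    token_type_ids=enc["token_type_ids"].to(device))
  pred = outputs[0].cpu().argmax(dim=-1).item()
  return idx_label[pred]  # one of 'entailment', 'neutral', 'contradiction'

print(predict("한 남자가 피아노를 치고 있다.", "한 남자가 악기를 연주하고 있다."))  # an entailment-style example pair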
