Named Entity Recognition

Author

차상진

Published

May 3, 2025

NER

NER(Named Entity Recognition, 개체명 인식) : NLP에서 특정 단어(토큰)가 사람, 장소, 조직, 날짜, 숫자, 제품명 등 어떤 개체(entity)에 해당하는지 인식하는 Task이다.

NER 작업을 위해 사전학습된 XLM-RoBERTa의 바디를 가져오고, 그 위에 토큰 분류 헤드를 붙여 미세튜닝을 한다. (헤드의 가중치는 체크포인트에 없으므로 새로 초기화된다.)

0. Imports

import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn

from datasets import load_dataset, get_dataset_config_names

from transformers import (
    AutoTokenizer,
    XLMRobertaConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta import RobertaModel, RobertaPreTrainedModel

from seqeval.metrics import (classification_report,f1_score)

from huggingface_hub import notebook_login

1. Load data

# xtreme데이터셋 중 PAN-X.de에 해당하는 데이터를 불러오기
xtreme_subsets = get_dataset_config_names("xtreme")
load_dataset("xtreme", name="PAN-X.de")

/root/anaconda3/envs/nlp/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

- 데이터를 dict에 언어 비율별로 섞어서 저장

스위스는 다국어를 사용하므로 스위스 환경을 반영하여 학습을 더 현실적이고 일반적으로 하기 위해서 섞어서 저장한다.

from collections import defaultdict
from datasets import DatasetDict

# Languages of the Swiss corpus and the fraction of each PAN-X split to keep
# per language (chosen to mimic the share of each language in Switzerland).
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
# A missing language key is created on first access as an empty DatasetDict.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle every split with a fixed seed, then keep the first `frac` of it.
    for split, subset in ds.items():
        rows_to_keep = int(frac * subset.num_rows)
        panx_ch[lang][split] = subset.shuffle(seed=0).select(range(rows_to_keep))

- 만들어진 dict

element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(f"{key}: {value}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']

for key, value in panx_ch["de"]["train"].features.items():
    print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

- ner_tags라는 숫자 리스트 -> 문자 리스트로 변환

tags.int2str(0) = "O", tags.int2str(5) = "B-LOC" 와 같이 숫자 레이블을 사람이 읽을 수 있는 문자열로 변환

def create_tag_names(batch):
    """Return a ``ner_tags_str`` column with the tag name of every tag id.

    Each integer under ``batch["ner_tags"]`` is converted to its string form
    with the module-level ``tags`` ClassLabel.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

panx_de = panx_ch["de"].map(create_tag_names)

de_example = panx_de["train"][0]
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]],['Tokens', 'Tags'])

	0	1	2	3	4	5	6	7	8	9	10	11
Tokens	2.000	Einwohnern	an	der	Danziger	Bucht	in	der	polnischen	Woiwodschaft	Pommern	.
Tags	O	O	O	O	B-LOC	I-LOC	O	O	B-LOC	B-LOC	I-LOC	O

# Per split, count how many entities of each type occur. Only "B-" tags are
# counted, so every entity contributes exactly once regardless of its length.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    entity_types = (
        tag.split("-")[1]
        for sentence_tags in dataset["ner_tags_str"]
        for tag in sentence_tags
        if tag.startswith("B")
    )
    for entity_type in entity_types:
        split2freqs[split][entity_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")
# The entity types are roughly balanced within each split, so this data is
# suitable for evaluating NER.

	LOC	ORG	PER
train	6186	5366	5810
validation	3172	2683	2893
test	3180	2573	3071

2. XLM-R 토큰화

- model, tokenizer

데이터가 준비되었으니 model과 tokenizer 선언

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

df = pd.DataFrame([bert_tokens, xlmr_tokens], index=["BERT", "XLM-R"])
df

	0	1	2	3	4	5	6	7	8	9
BERT	[CLS]	Jack	Spa	##rrow	loves	New	York	!	[SEP]	None
XLM-R	<s>	▁Jack	▁Spar	row	▁love	s	▁New	▁York	!	</s>

- 유니코드 문자 U+2581을 공백 문자로 바꾸는 것

"".join(xlmr_tokens).replace(u"\u2581", " ")

'<s> Jack Sparrow loves New York!</s>'

2-1. 헤드 정의

- 토큰 분류를 위한 헤드를 가진 사용자 정의 모델 만들기

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a linear token-classification head on top."""

    # Configuration class associated with this model.
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Create the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body, built without the pooling layer.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Head: dropout followed by a linear layer that yields per-token logits.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize the weights.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None):
        """Run the encoder and the head; compute the loss when labels are given.

        Returns:
            A ``TokenClassifierOutput`` carrying ``loss`` (``None`` when
            ``labels`` is ``None``), ``logits``, and the encoder's
            ``hidden_states`` and ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # The first encoder output is passed through dropout, then the classifier.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten so that every token position is one classification example.
            criterion = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            loss = criterion(flat_logits, labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

- model config 설정

# Lookup tables in both directions between class indices and tag names.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

from transformers import AutoConfig

# Load the checkpoint's configuration and attach the NER tag set to it.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)

- 모델 설정

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

- xlmr 토크나이저로 토큰화

input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

	0	1	2	3	4	5	6	7	8	9
Tokens	<s>	▁Jack	▁Spar	row	▁love	s	▁New	▁York	!	</s>
Input IDs	0	21763	37456	15555	5161	7	2356	5753	38	2

- 토큰화한 input을 model에 넣고 logits 계산

outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f"시퀀스에 있는 토큰 개수: {len(xlmr_tokens)}")
print(f"출력 크기: {outputs.shape}")

시퀀스에 있는 토큰 개수: 10
출력 크기: torch.Size([1, 10, 7])

- logits로 tag 추정

preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

	0	1	2	3	4	5	6	7	8	9
Tokens	<s>	▁Jack	▁Spar	row	▁love	s	▁New	▁York	!	</s>
Tags	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG	I-ORG

바디와 헤드를 불러왔지만 헤드는 가중치가 랜덤이기에 fine tuning을 해야함

Fine tuning 전처리

모델을 훈련하기 전에 입력을 토큰화하고 레이블을 준비하자.

앞의 과정을 함수로 구현해놓자

def tag_text(text, tags, model, tokenizer):
    """Predict a tag for every token of ``text`` and return them as a table.

    Args:
        text: Raw input string.
        tags: Object whose ``names`` sequence maps a class index to a tag name.
        model: Callable model; its first output for the input ids is used as
            the per-token logits.
        tokenizer: Tokenizer used for both the displayed tokens and the ids
            fed to the model.

    Returns:
        A ``pandas.DataFrame`` with a "Tokens" row and a "Tags" row, one
        column per token.
    """
    # Encode once with the tokenizer that was passed in, so the displayed
    # tokens and the model inputs always come from the same encoding.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids
    # Move the inputs to wherever the model's weights live instead of relying
    # on a module-level device variable.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids)[0]
    predictions = torch.argmax(logits, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

- 학습을 위한 words, labels 준비

words, labels = de_example["tokens"], de_example["ner_tags"]

- words -> tokens

tokenized_input = xlmr_tokenizer(de_example['tokens'],is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) # 토큰의 ID 리스트를 실제 토큰 문자열로 변환하는 함수

is_split_into_words=True로 설정해야 토크나이저가 단어 단위로 처리됨

Q.왜 단어들을 정수 리스트로 바꾸고 또 다시 문자열로 변환하는가?

A.원래 단어(입력)와 토큰화된 단어(출력)을 비교하기 위해서이다. 어떤 방식으로 토큰화가 되는지 확인할 수 있다. 그리고 모델의 예측 결과를 해석하기 위해서이다. 실제 모델에 입력할 때는 여전히 정수 형태를 사용한다.

- 보기 좋게 데이터프레임으로 변환

pd.DataFrame([tokens], index=["Tokens"])

	0	1	2	3	4	5	6	7	8	9	...	15	16	17	18	19	20	21	22	23	24
Tokens	<s>	▁2.000	▁Einwohner	n	▁an	▁der	▁Dan	zi	ger	▁Buch	...	▁Wo	i	wod	schaft	▁Po	mmer	n	▁	.	</s>

1 rows × 25 columns

- 각 토큰이 원래 입력된 단어 리스트에서 몇 번째 단어에 해당하는지 인덱스로 반환

word_ids = tokenized_input.word_ids() # 토큰화된 토큰들이 원래 어떤 단어에서 왔는지 알려주는 함수.
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

	0	1	2	3	4	5	6	7	8	9	...	15	16	17	18	19	20	21	22	23	24
Tokens	<s>	▁2.000	▁Einwohner	n	▁an	▁der	▁Dan	zi	ger	▁Buch	...	▁Wo	i	wod	schaft	▁Po	mmer	n	▁	.	</s>
Word IDs	None	0	1	1	2	3	4	4	4	5	...	9	9	9	9	10	10	10	11	11	None

2 rows × 25 columns

Word IDs가 같은 토큰들은 같은 단어에서 나온 subword라는 의미이다.

- 학습 전 subword 토큰을 masking 처리하는 과정

previous_word_idx = None
label_ids = []

# Only the first sub-token of each word keeps the word's label; every other
# position gets -100 so that it is masked out.
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # Token without a word id (e.g. <s>, </s>) or a continuation
        # sub-token of the same word: ignore it.
        label_ids.append(-100)
    else:
        # First sub-token of a new word: use that word's label.
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx

- 데이터프레임으로 변환

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

	0	1	2	3	4	5	6	7	8	9	...	15	16	17	18	19	20	21	22	23	24
Tokens	<s>	▁2.000	▁Einwohner	n	▁an	▁der	▁Dan	zi	ger	▁Buch	...	▁Wo	i	wod	schaft	▁Po	mmer	n	▁	.	</s>
Word IDs	None	0	1	1	2	3	4	4	4	5	...	9	9	9	9	10	10	10	11	11	None
Label IDs	-100	0	0	-100	0	0	5	-100	-100	6	...	5	-100	-100	-100	6	-100	-100	0	-100	-100
Labels	IGN	O	O	IGN	O	O	B-LOC	IGN	IGN	I-LOC	...	B-LOC	IGN	IGN	IGN	I-LOC	IGN	IGN	O	IGN	IGN

4 rows × 25 columns

Q. 왜 -100인가요?

A. torch.nn.CrossEntropyLoss의 ignore_index 속성값이 -100이기 때문이다. 그래서 훈련시 이 인덱스는 무시된다.

- 토큰화 -> subword masking 과정을 함수화

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align labels to sub-tokens.

    Reads ``examples["tokens"]`` and ``examples["ner_tags"]``. For every
    sentence, the first sub-token of each word receives the word's label and
    all other positions (tokens without a word id, continuation sub-tokens)
    receive -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = xlmr_tokenizer(examples["tokens"], truncation=True,
                             is_split_into_words=True)
    aligned_labels = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # A position is labelled only when it starts a new word.
            starts_word = word is not None and word != last_word
            token_labels.append(word_labels[word] if starts_word else -100)
            last_word = word
        aligned_labels.append(token_labels)
    encoded["labels"] = aligned_labels
    return encoded

- tokenize_and_align_labels 함수를 배치 단위로 적용

def encode_panx_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The raw ``langs``, ``ner_tags`` and ``tokens`` columns are dropped from
    the result.
    """
    raw_columns = ['langs', 'ner_tags', 'tokens']
    return corpus.map(tokenize_and_align_labels,
                      remove_columns=raw_columns,
                      batched=True)

panx_de_encoded = encode_panx_dataset(panx_ch["de"])

Map: 100%|██████████| 6290/6290 [00:00<00:00, 17222.82 examples/s]

panx_de_encoded['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12580
})

Fine tuning 학습

- 평가 방식 선언

y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
          ["B-PER", "I-PER", "O"]]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2

- 예측 결과를 문자 label로 바꾸는 함수

def align_predictions(predictions, label_ids):
    """Convert logits and label ids into per-example lists of tag names.

    Args:
        predictions: Array whose last (third) axis holds the class scores.
        label_ids: 2-D array of gold label ids; positions equal to -100 are
            skipped in both outputs.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag names
        per example, looked up through the module-level ``index2tag``.
    """
    pred_ids = np.argmax(predictions, axis=2)
    n_examples, n_tokens = pred_ids.shape
    labels_list, preds_list = [], []

    for i in range(n_examples):
        # Positions that carry a real label (everything except -100).
        kept = [j for j in range(n_tokens) if label_ids[i, j] != -100]
        labels_list.append([index2tag[label_ids[i][j]] for j in kept])
        preds_list.append([index2tag[pred_ids[i][j]] for j in kept])

    return preds_list, labels_list

학습

# Hyperparameters of the fine-tuning run.
batch_size = 16
num_epochs = 3
model_name = f"{xlmr_model_name}-finetuned-panx-de"
# Number of full batches in the training split, i.e. log about once per epoch.
logging_steps = len(panx_de_encoded["train"]) // batch_size

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    logging_steps=logging_steps,
    # NOTE(review): presumably set this high so that no intermediate
    # checkpoint is written during the run -- confirm.
    save_steps=1e6,
    log_level="error",
    disable_tqdm=False,
    report_to="none",
)

- 평가 지표 설정

def compute_metrics(eval_pred):
    """Return ``{"f1": ...}`` for the predictions held by ``eval_pred``.

    ``eval_pred.predictions`` and ``eval_pred.label_ids`` are converted to
    tag-name sequences with ``align_predictions`` and scored with
    ``f1_score``.
    """
    predicted_tags, gold_tags = align_predictions(eval_pred.predictions,
                                                  eval_pred.label_ids)
    score = f1_score(gold_tags, predicted_tags)
    return {"f1": score}

- DataCollator 선언

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

- 필요할 때 model을 초기화하기 위한 함수

def model_init():
    """Load a fresh model from ``xlmr_model_name`` and move it to ``device``."""
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config)
    return model.to(device)

# Build the Trainer. The tokenizer is passed as `processing_class`, because
# the `tokenizer` argument of `Trainer.__init__` is deprecated and will be
# removed in version 5.0.0.
trainer = Trainer(model_init=model_init,
                  args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"],
                  processing_class=xlmr_tokenizer)

# Run the fine-tuning loop.
trainer.train()

/tmp/ipykernel_19299/1497450380.py:1: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(model_init=model_init,

[2361/2361 06:35, Epoch 3/3]

Step	Training Loss
786	0.252300
1572	0.126200
2358	0.079100

TrainOutput(global_step=2361, training_loss=0.15248227698830213, metrics={'train_runtime': 395.745, 'train_samples_per_second': 95.364, 'train_steps_per_second': 5.966, 'total_flos': 792244708985400.0, 'train_loss': 0.15248227698830213, 'epoch': 3.0})

Fine tuning 완료!