{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b2cd16ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration\n",
    "import torch\n",
    "import re\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d0f73fea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=> Using device: cuda:0 | GPU: NVIDIA GeForce RTX 4060\n"
     ]
    }
   ],
   "source": [
    "# Fail fast: every later cell assumes a CUDA device is available.\n",
    "if not torch.cuda.is_available():\n",
    "    raise RuntimeError(\n",
    "        \"CUDA가 비활성화되어 있습니다. \"\n",
    "        \"1) NVIDIA 드라이버 갱신, 2) CUDA 휠(torch cu121 등) 재설치, \"\n",
    "        \"3) 커널 재시작 후 다시 실행하세요.\"\n",
    "    )\n",
    "\n",
    "device = torch.device(\"cuda:0\")\n",
    "# Reach cuDNN through `torch.backends`: no bare `cudnn` name is imported in this\n",
    "# notebook, so `cudnn.benchmark` raises NameError on a freshly started kernel.\n",
    "torch.backends.cudnn.benchmark = True\n",
    "# Inference only: turn autograd off globally.\n",
    "torch.set_grad_enabled(False)\n",
    "print(\"\\n=> Using device:\", device, \"| GPU:\", torch.cuda.get_device_name(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "82315896",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ----- Load data -----\n",
    "# NSMC training ratings, fetched from GitHub and parsed as a tab-separated table.\n",
    "url = \"https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt\"\n",
    "df = pd.read_csv(url, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "317759dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocessor(text):\n",
    "    \"\"\"Normalise one review into lowercase Hangul/Latin words joined by single spaces.\n",
    "\n",
    "    Args:\n",
    "        text: Raw review string, or a null value as recognised by ``pd.isnull``.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned string, or an empty string when ``text`` is null.\n",
    "    \"\"\"\n",
    "    if pd.isnull(text):\n",
    "        return \"\"\n",
    "    cleaned = text.lower()\n",
    "    # Applied in order, each match becoming one space:\n",
    "    #   1) anything that is not a Hangul syllable, Latin letter, whitespace or . ? ! ,\n",
    "    #   2) the punctuation kept by step 1\n",
    "    #   3) runs of whitespace\n",
    "    for pattern in (r\"[^가-힣a-zA-Z\\s.?!,]\", r\"[.?!,]\", r\"\\s+\"):\n",
    "        cleaned = re.sub(pattern, \" \", cleaned)\n",
    "    return cleaned.strip()\n",
    "\n",
    "# Keep the cleaned text in its own column next to the raw `document` column.\n",
    "df[\"clean_document\"] = df[\"document\"].map(preprocessor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fbb37dc4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You passed `num_labels=3` which is incompatible to the `id2label` map of length `2`.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model device: cuda:0\n"
     ]
    }
   ],
   "source": [
    "# Tokenizer and seq2seq weights are both loaded from the same checkpoint name.\n",
    "tokenizer = PreTrainedTokenizerFast.from_pretrained(\"Soyoung97/gec_kr\")\n",
    "\n",
    "# Load the weights, cast them to FP16 and move them to `device`.\n",
    "model = (\n",
    "    BartForConditionalGeneration.from_pretrained(\"Soyoung97/gec_kr\")\n",
    "    .half()\n",
    "    .to(device)\n",
    ")\n",
    "model.eval()\n",
    "\n",
    "# Sanity check: report the device the first model parameter lives on.\n",
    "print(\"model device:\", next(model.parameters()).device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "89958bea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ----- Preprocessing -----\n",
    "# `preprocessor` and df[\"clean_document\"] are already produced by the preprocessing\n",
    "# cell earlier in this notebook. They are deliberately not redefined here so the\n",
    "# cleaning rules live in exactly one place.\n",
    "\n",
    "# ----- Batched correction function (CUDA-aware) -----\n",
    "@torch.no_grad()\n",
    "def gec_correct_batch(texts, max_length=128, num_beams=4, repetition_penalty=2.0):\n",
    "    \"\"\"Run the grammar-correction model over one batch of strings.\n",
    "\n",
    "    Blank entries (empty or whitespace-only) are sent to the model as a single\n",
    "    period so the batch keeps its size, and are always returned as empty strings.\n",
    "\n",
    "    Args:\n",
    "        texts: List of strings to correct.\n",
    "        max_length: Truncation length for tokenisation and length cap for generation.\n",
    "        num_beams: Beam width passed to ``model.generate``.\n",
    "        repetition_penalty: Repetition penalty passed to ``model.generate``.\n",
    "\n",
    "    Returns:\n",
    "        A list of decoded strings in the same order as ``texts``.\n",
    "    \"\"\"\n",
    "    mask_empty = [not t.strip() for t in texts]\n",
    "    # Empty batch or nothing but blank strings: there is nothing to correct, so answer\n",
    "    # without calling the tokenizer or the model at all.\n",
    "    if all(mask_empty):\n",
    "        return [\"\"] * len(mask_empty)\n",
    "\n",
    "    safe_texts = [\".\" if me else t for t, me in zip(texts, mask_empty)]\n",
    "\n",
    "    enc = tokenizer(\n",
    "        safe_texts,\n",
    "        return_tensors=\"pt\",\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        max_length=max_length,\n",
    "    )\n",
    "\n",
    "    # Move every encoded tensor to `device` before generation.\n",
    "    enc = {k: v.to(device) for k, v in enc.items()}\n",
    "\n",
    "    out = model.generate(\n",
    "        **enc,\n",
    "        max_length=max_length,\n",
    "        num_beams=num_beams,\n",
    "        early_stopping=True,\n",
    "        eos_token_id=tokenizer.eos_token_id,\n",
    "        repetition_penalty=repetition_penalty,\n",
    "    )\n",
    "\n",
    "    dec = tokenizer.batch_decode(out, skip_special_tokens=True)\n",
    "\n",
    "    # Blank inputs stay blank, whatever was generated for their placeholder.\n",
    "    return [\"\" if is_empty else text for text, is_empty in zip(dec, mask_empty)]\n",
    "\n",
    "# ----- Process the whole dataset in batches -----\n",
    "BATCH = 32  # can be raised to 64 or more when VRAM allows\n",
    "rows = df[\"clean_document\"].tolist()\n",
    "preds = []\n",
    "for start in range(0, len(rows), BATCH):\n",
    "    # Slicing past the end is safe, so the final batch may simply be shorter.\n",
    "    preds += gec_correct_batch(rows[start : start + BATCH])\n",
    "\n",
    "df[\"gec_document\"] = preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "fc9a369b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>document</th>\n",
       "      <th>label</th>\n",
       "      <th>clean_document</th>\n",
       "      <th>gec_document</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9976970</td>\n",
       "      <td>아 더빙.. 진짜 짜증나네요 목소리</td>\n",
       "      <td>0</td>\n",
       "      <td>아 더빙 진짜 짜증나네요 목소리</td>\n",
       "      <td>아 더빙 진짜 짜증나네요 목소리</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3819312</td>\n",
       "      <td>흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나</td>\n",
       "      <td>1</td>\n",
       "      <td>흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나</td>\n",
       "      <td>흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10265843</td>\n",
       "      <td>너무재밓었다그래서보는것을추천한다</td>\n",
       "      <td>0</td>\n",
       "      <td>너무재밓었다그래서보는것을추천한다</td>\n",
       "      <td>너무재밌었다그래서보는것을추천한다</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9045019</td>\n",
       "      <td>교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정</td>\n",
       "      <td>0</td>\n",
       "      <td>교도소 이야기구먼 솔직히 재미는 없다 평점 조정</td>\n",
       "      <td>교도소 이야기구먼 솔직히 재미는 없다 평점 조정</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6483659</td>\n",
       "      <td>사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...</td>\n",
       "      <td>1</td>\n",
       "      <td>사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...</td>\n",
       "      <td>사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149995</th>\n",
       "      <td>6222902</td>\n",
       "      <td>인간이 문제지.. 소는 뭔죄인가..</td>\n",
       "      <td>0</td>\n",
       "      <td>인간이 문제지 소는 뭔죄인가</td>\n",
       "      <td>인간이 문제지 소는 무슨 죄인가</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149996</th>\n",
       "      <td>8549745</td>\n",
       "      <td>평점이 너무 낮아서...</td>\n",
       "      <td>1</td>\n",
       "      <td>평점이 너무 낮아서</td>\n",
       "      <td>평점이 너무 낮아서</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149997</th>\n",
       "      <td>9311800</td>\n",
       "      <td>이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?</td>\n",
       "      <td>0</td>\n",
       "      <td>이게 뭐요 한국인은 거들먹거리고 필리핀 혼혈은 착하다</td>\n",
       "      <td>이게 뭐요 한국인은 거들먹거리고 필리핀은 착하다</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149998</th>\n",
       "      <td>2376369</td>\n",
       "      <td>청춘 영화의 최고봉.방황과 우울했던 날들의 자화상</td>\n",
       "      <td>1</td>\n",
       "      <td>청춘 영화의 최고봉 방황과 우울했던 날들의 자화상</td>\n",
       "      <td>청춘 영화의 최고봉 방황과 우울했던 날들의 자화상</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149999</th>\n",
       "      <td>9619869</td>\n",
       "      <td>한국 영화 최초로 수간하는 내용이 담긴 영화</td>\n",
       "      <td>0</td>\n",
       "      <td>한국 영화 최초로 수간하는 내용이 담긴 영화</td>\n",
       "      <td>한국 영화 최초로 편집하는 내용이 담긴 영화</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>150000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                                           document  label  \\\n",
       "0        9976970                                아 더빙.. 진짜 짜증나네요 목소리      0   \n",
       "1        3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1   \n",
       "2       10265843                                  너무재밓었다그래서보는것을추천한다      0   \n",
       "3        9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0   \n",
       "4        6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1   \n",
       "...          ...                                                ...    ...   \n",
       "149995   6222902                                인간이 문제지.. 소는 뭔죄인가..      0   \n",
       "149996   8549745                                      평점이 너무 낮아서...      1   \n",
       "149997   9311800                    이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?      0   \n",
       "149998   2376369                        청춘 영화의 최고봉.방황과 우울했던 날들의 자화상      1   \n",
       "149999   9619869                           한국 영화 최초로 수간하는 내용이 담긴 영화      0   \n",
       "\n",
       "                                           clean_document  \\\n",
       "0                                       아 더빙 진짜 짜증나네요 목소리   \n",
       "1                            흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나   \n",
       "2                                       너무재밓었다그래서보는것을추천한다   \n",
       "3                              교도소 이야기구먼 솔직히 재미는 없다 평점 조정   \n",
       "4       사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...   \n",
       "...                                                   ...   \n",
       "149995                                    인간이 문제지 소는 뭔죄인가   \n",
       "149996                                         평점이 너무 낮아서   \n",
       "149997                      이게 뭐요 한국인은 거들먹거리고 필리핀 혼혈은 착하다   \n",
       "149998                        청춘 영화의 최고봉 방황과 우울했던 날들의 자화상   \n",
       "149999                           한국 영화 최초로 수간하는 내용이 담긴 영화   \n",
       "\n",
       "                                             gec_document  \n",
       "0                                       아 더빙 진짜 짜증나네요 목소리  \n",
       "1                            흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나  \n",
       "2                                       너무재밌었다그래서보는것을추천한다  \n",
       "3                              교도소 이야기구먼 솔직히 재미는 없다 평점 조정  \n",
       "4       사이몬페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어보이기만 했던 커스틴 ...  \n",
       "...                                                   ...  \n",
       "149995                                  인간이 문제지 소는 무슨 죄인가  \n",
       "149996                                         평점이 너무 낮아서  \n",
       "149997                         이게 뭐요 한국인은 거들먹거리고 필리핀은 착하다  \n",
       "149998                        청춘 영화의 최고봉 방황과 우울했던 날들의 자화상  \n",
       "149999                           한국 영화 최초로 편집하는 내용이 담긴 영화  \n",
       "\n",
       "[150000 rows x 5 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pass the corrected text through the same cleaner that produced `clean_document`.\n",
    "df['gec_document'] = df['gec_document'].map(preprocessor)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f39a59ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "from mecab import MeCab\n",
    "# One shared MeCab analyser instance; the tokenisation cells below call `mecab.morphs`.\n",
    "mecab = MeCab()\n",
    "# Shorthand alias for the morpheme splitter.\n",
    "# NOTE(review): no later cell references the bare name `morphs` - confirm it is still needed.\n",
    "morphs = mecab.morphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "b9584b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Named `tokenize_morphs` rather than `tokenizer`: `tokenizer` is the Hugging Face\n",
    "# tokenizer that `gec_correct_batch` looks up as a global, and rebinding that name to\n",
    "# this function would break every later call to the correction function.\n",
    "def tokenize_morphs(text):\n",
    "    \"\"\"Split one text into morphemes with the shared MeCab analyser.\n",
    "\n",
    "    Args:\n",
    "        text: String to analyse, or a null value as recognised by ``pd.isnull``.\n",
    "\n",
    "    Returns:\n",
    "        The result of ``mecab.morphs(text)``, or an empty list when ``text`` is null.\n",
    "    \"\"\"\n",
    "    if pd.isnull(text):\n",
    "        return []\n",
    "    return mecab.morphs(text)\n",
    "\n",
    "# Tokenise both the cleaned and the corrected text columns.\n",
    "df['clean_tokenized'] = df['clean_document'].apply(tokenize_morphs)\n",
    "df['gec_tokenized'] = df['gec_document'].apply(tokenize_morphs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "a308c054",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리', '흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍', '지', '않', '구나', '너무', '재', '밓었다그래서보는것을추천한다', '교도소', '이야기', '구먼', '솔직히', '재미', '는', '없', '다', '평점', '조정', '사이몬페그', '의', '익살', '스런', '연기', '가', '돋보였', '던', '영화', '스파이더맨', '에서', '늙', '어', '보이', '기', '만']\n",
      "전체 토큰 2406806 개\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-review morpheme lists already stored in df['clean_tokenized'] into\n",
    "# one corpus-level list, instead of running MeCab over every review a second time.\n",
    "clean_tokens = [token for tokens in df['clean_tokenized'] for token in tokens]\n",
    "\n",
    "print(clean_tokens[:50])  # peek at the first 50 tokens\n",
    "print(\"전체 토큰\",len(clean_tokens),\"개\")  # total number of tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "67f592ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리', '흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍', '지', '않', '구나', '너무', '재밌', '었', '다', '그래서', '보', '는', '것', '을', '추천', '한다', '교도소', '이야기', '구먼', '솔직히', '재미', '는', '없', '다', '평점', '조정', '사이몬페그', '의', '익살', '스런', '연기', '가', '돋보였', '던']\n",
      "전체 토큰 2422958 개\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-review morpheme lists already stored in df['gec_tokenized'] into\n",
    "# one corpus-level list, instead of running MeCab over every review a second time.\n",
    "gec_tokens = [token for tokens in df['gec_tokenized'] for token in tokens]\n",
    "\n",
    "print(gec_tokens[:50])  # peek at the first 50 tokens\n",
    "print(\"전체 토큰\",len(gec_tokens),\"개\")  # total number of tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "0dfae3cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리', '흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍', '지', '않']\n",
      "고유 토큰 수: 49171\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# 1. Count how often each token occurs\n",
    "clean_counter = Counter(clean_tokens)\n",
    "\n",
    "\n",
    "# 2. Inspect the result: the 20 most frequent tokens with their counts.\n",
    "#    Read from the counter, not from clean_tokens[:20], which is only the first 20\n",
    "#    tokens of the corpus and says nothing about frequency.\n",
    "print(clean_counter.most_common(20))\n",
    "print(\"고유 토큰 수:\", len(clean_counter))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "6028dc87",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리', '흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍', '지', '않']\n",
      "고유 토큰 수: 47542\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# 1. Count how often each token occurs\n",
    "gec_counter = Counter(gec_tokens)\n",
    "\n",
    "\n",
    "# 2. Inspect the result: the 20 most frequent tokens with their counts.\n",
    "#    Read from the counter, not from gec_tokens[:20], which is only the first 20\n",
    "#    tokens of the corpus and says nothing about frequency.\n",
    "print(gec_counter.most_common(20))\n",
    "print(\"고유 토큰 수:\", len(gec_counter))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11e9b9e4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}