Ewha-Euron · LeeJungYeonn · Jan 6, 2025
diff --git a/Week15_복습과제_이정연.ipynb b/Week15_복습과제_이정연.ipynb
@@ -0,0 +1,354 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "f2f5tsEbbovj"
+      },
+      "source": [
+        "# **1. 라이브러리 및 기본 설정**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "yWN4bwg4bovp",
+        "outputId": "80dab0bb-164d-4bc8-e597-d0d077541cc3"
+      },
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "<torch._C.Generator at 0x7e6216fb3e10>"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 1
+        }
+      ],
+      "source": [
+        "import torch\n",
+        "import torch.nn as nn\n",
+        "import torch.nn.functional as F\n",
+        "import torch.optim as optim\n",
+        "\n",
+        "torch.manual_seed(1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Qr6PEmIQbovs"
+      },
+      "source": [
+        "# **2. 임베딩**\n",
+        "- 데이터 임베딩에 대한 간단한 예제"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "iE8t5INObovt"
+      },
+      "outputs": [],
+      "source": [
+        "## 단어를 인덱스에 매핑하는 딕셔너리\n",
+        "word_to_ix = {\"hello\": 0, \"world\": 1}\n",
+        "\n",
+        "## 임베딩 레이어 생성(2개의 단어, 5차원 임베딩)\n",
+        "# nn.Embedding 사용\n",
+        "embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "_oJn7mJabovu",
+        "outputId": "2fb58f87-c038-45af-96f3-64f972976c94"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "tensor([[-0.8923, -0.0583, -0.1955, -0.9656,  0.4224]],\n",
+            "       grad_fn=<EmbeddingBackward0>)\n"
+          ]
+        }
+      ],
+      "source": [
+        "## 임베딩 벡터 생성\n",
+        "# 특정 단어에 대한 텐서를 생성하고 임베딩 레이어를 통해 벡터를 조회\n",
+        "lookup_tensor = torch.tensor([word_to_ix[\"hello\"]], dtype=torch.long)\n",
+        "hello_embed = embeds(lookup_tensor)\n",
+        "print(hello_embed) # \"hello\" 단어의 임베딩 벡터 출력"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WLAa3MkBbovu"
+      },
+      "source": [
+        "# **3. 데이터 준비**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "id": "n53-lJPmbovv"
+      },
+      "outputs": [],
+      "source": [
+        "# 컨텍스트 크기/ 임베딩 차원 정의\n",
+        "CONTEXT_SIZE = 2\n",
+        "EMBEDDING_DIM = 10\n",
+        "\n",
+        "# Shakespeare 텍스트를 단어 단위로 분리하여 N-Gram 데이터로 저장\n",
+        "test_sentence = \"\"\"When forty winters shall besiege thy brow,\n",
+        "And dig deep trenches in thy beauty's field,\n",
+        "Thy youth's proud livery so gazed on now,\n",
+        "Will be a totter'd weed of small worth held:\n",
+        "Then being asked, where all thy beauty lies,\n",
+        "Where all the treasure of thy lusty days;\n",
+        "To say, within thine own deep sunken eyes,\n",
+        "Were an all-eating shame, and thriftless praise.\n",
+        "How much more praise deserv'd thy beauty's use,\n",
+        "If thou couldst answer 'This fair child of mine\n",
+        "Shall sum my count, and make my old excuse,'\n",
+        "Proving his beauty by succession thine!\n",
+        "This were to be new made when thou art old,\n",
+        "And see thy blood warm when thou feel'st it cold.\"\"\".split()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "GI5MebnIbovw",
+        "outputId": "bdfbd63c-3ae2-4183-f3ee-10e200f0a32b"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[(['forty', 'When'], 'winters'), (['winters', 'forty'], 'shall'), (['shall', 'winters'], 'besiege')]\n"
+          ]
+        }
+      ],
+      "source": [
+        "## N-Gram 데이터 생성\n",
+        "# 원래는 입력을 토큰화해야하지만 해당 예제에서는 튜플 리스트를 생성\n",
+        "# 각 튜플은 ([word_i-CONTEXT_SIZE, ..., word_i-1], 대상 단어) 형태\n",
+        "\n",
+        "ngrams = [\n",
+        "    (\n",
+        "        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)], # 컨텍스트 단어들\n",
+        "        test_sentence[i]\n",
+        "    )\n",
+        "    for i in range(CONTEXT_SIZE, len(test_sentence))\n",
+        "]\n",
+        "\n",
+        "# 처음 3개의 데이터 확인하기\n",
+        "print(ngrams[:3])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "id": "6McWNAuebovx"
+      },
+      "outputs": [],
+      "source": [
+        "# 전체 텍스트에서 고유 단어를 추출하여 vocabulary map 생성\n",
+        "vocab = set(test_sentence)\n",
+        "word_to_ix = {word: i for i, word in enumerate(vocab)}"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9SQK9ZIvbovx"
+      },
+      "source": [
+        "# **4. 모델링**\n",
+        "- n-gram 언어 모델 정의"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "id": "78I-XH3Pbovy"
+      },
+      "outputs": [],
+      "source": [
+        "## N-Gram 언어 모델 클래스 정의\n",
+        "# 임베딩 레이어와 두 개의 선형 계층(Linear)으로 구성된 N-Gram 언어 모델\n",
+        "class NGramLanguageModeler(nn.Module):\n",
+        "    def __init__(self, vocab_size, embedding_dim, context_size):\n",
+        "        super(NGramLanguageModeler, self).__init__()\n",
+        "        ## 임베딩 레이어 생성\n",
+        "        # 주어진 \"어휘 크기\"와 \"임베딩 차원\"을 기반으로 랜덤하게 초기화된 임베딩 행렬 생성\n",
+        "        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
+        "\n",
+        "        ## 첫 번째 선형 계층\n",
+        "        # input_dim: context_size * embedding_dim, output_dim: 128\n",
+        "        self.linear1 = nn.Linear(context_size * embedding_dim, 128)\n",
+        "\n",
+        "        ## 두 번째 선형 계층\n",
+        "        # 첫 번째 선형 계층의 출력을 받아 어휘집 크기로 출력\n",
+        "        self.linear2 = nn.Linear(128, vocab_size)\n",
+        "\n",
+        "\n",
+        "    def forward(self, inputs):\n",
+        "        # 임베딩 레이어를 통과하여 입력 벡터를 펼침(flatten)\n",
+        "        embeds = self.embeddings(inputs).view((1, -1))\n",
+        "        # 첫 번째 선형 계층 통과 및 활성화 함수 적용\n",
+        "        out = F.relu(self.linear1(embeds))\n",
+        "        # 두 번째 선형 계층 통과(활성화 함수 적용 x)\n",
+        "        out = self.linear2(out)\n",
+        "        # 출력층: 로그 소프트맥스를 통해 proportion 계산\n",
+        "        log_probs = F.log_softmax(out, dim=1)\n",
+        "\n",
+        "        return log_probs"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ejxykzwdbovy"
+      },
+      "source": [
+        "# **5. 학습**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "metadata": {
+        "id": "Yhol6ZZmbovy"
+      },
+      "outputs": [],
+      "source": [
+        "## 손실 함수 및 최적화 함수(optimizer) 설정\n",
+        "losses = []\n",
+        "loss_function = nn.NLLLoss() # negative log-likelihood\n",
+        "model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE) # 모델 객체 생성\n",
+        "optimizer = optim.SGD(model.parameters(), lr=0.001) # 경사 하강법 활용"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "kAczRVV_bovz",
+        "outputId": "33be6883-e14c-44a4-e25e-37d1500f713a"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "[519.8730397224426, 517.4654364585876, 515.0745615959167, 512.6986198425293, 510.33618927001953, 507.9869465827942, 505.649133682251, 503.32190561294556, 501.00630140304565, 498.70126819610596]\n"
+          ]
+        }
+      ],
+      "source": [
+        "for epoch in range(10):\n",
+        "    # 각 epoch에서의 total_loss 초기화\n",
+        "    total_loss = 0\n",
+        "\n",
+        "    # 각 N-Gram 데이터를 활용하여..\n",
+        "    for context, target in ngrams:\n",
+        "        # vocabulary mapping(컨텍스트 단어 → 정수 인덱스)\n",
+        "        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)\n",
+        "\n",
+        "        ## forward pass\n",
+        "        # 그래디언트 초기화\n",
+        "        model.zero_grad()\n",
+        "        # log probability 계산\n",
+        "        log_probs = model(context_idxs)\n",
+        "        # 손실 계산(대상 단어의 인덱스를 사용)\n",
+        "        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))\n",
+        "\n",
+        "        ## backward pass\n",
+        "        loss.backward()\n",
+        "        optimizer.step()\n",
+        "        total_loss += loss.item() # 현재 손실을 총 손실에 추가\n",
+        "    # 각 에포크의 손실 저장\n",
+        "    losses.append(total_loss)\n",
+        "print(losses)  # 에포크 별 손실 출력 (훈련이 진행됨에 따라 감소)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "qi0e0VIMbovz",
+        "outputId": "32a4093f-e471-4f37-d495-c6b17677b182"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "tensor([ 0.8419,  0.7298, -0.0714,  0.1549,  0.1786,  1.4526, -1.1712, -2.2411,\n",
+            "        -1.3467,  2.8665], grad_fn=<SelectBackward0>)\n"
+          ]
+        }
+      ],
+      "source": [
+        "## 학습된 임베딩 확인\n",
+        "# 특정 단어의 임베딩 결과 확인(예: \"beauty\")\n",
+        "print(model.embeddings.weight[word_to_ix[\"beauty\"]])"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.8"
+    },
+    "colab": {
+      "provenance": []
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}