diff --git "a/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.ipynb" "b/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.ipynb" new file mode 100644 index 0000000..2e16e06 --- /dev/null +++ "b/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.ipynb" @@ -0,0 +1,831 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e14558a2", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "534b48b6df5241d8ab12ea1ef1e337ef", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/972k [00:006,}'.format(tup[0], tup[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9da9f047", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n" + ] + } + ], + "source": [ + "segments_ids = [1] * len(tokenized_text)\n", + "print (segments_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1b9a81c0", + "metadata": {}, + "outputs": [], + "source": [ + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "segments_tensors = torch.tensor([segments_ids])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2637806e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b75c2ed4c0e745169544118ef8889ad5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/681M [00:00\n", + "각 계층에서의 텐서 형태: torch.Size([1, 33, 768])\n" + ] + } + ], + "source": [ + "print('은닉 상태의 유형: ', type(hidden_states))\n", + "print('각 계층에서의 텐서 형태: ', hidden_states[0].size())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9633fe69", + "metadata": {}, 
+ "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([13, 1, 33, 768])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_embeddings = torch.stack(hidden_states, dim=0)\n", + "token_embeddings.size()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5d777453", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([13, 33, 768])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_embeddings = torch.squeeze(token_embeddings, dim=1)\n", + "token_embeddings.size()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "aa6a4d37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([33, 13, 768])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_embeddings = token_embeddings.permute(1,0,2)\n", + "token_embeddings.size()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b2f7874", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "형태는: 33 x 3072\n" + ] + } + ], + "source": [ + "token_vecs_cat = []\n", + "for token in token_embeddings:\n", + " cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)\n", + " token_vecs_cat.append(cat_vec)\n", + "print ('형태는: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7621bc1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "형태는: 33 x 768\n" + ] + } + ], + "source": [ + "token_vecs_sum = []\n", + "for token in token_embeddings:\n", + " sum_vec = torch.sum(token[-4:], dim=0)\n", + " token_vecs_sum.append(sum_vec)\n", + "print ('형태는: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 16, + "id": "2a14ae7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "최종 임베딩 벡터의 형태: torch.Size([768])\n" + ] + } + ], + "source": [ + "token_vecs = hidden_states[-2][0]\n", + "sentence_embedding = torch.mean(token_vecs, dim=0)\n", + "print (\"최종 임베딩 벡터의 형태:\", sentence_embedding.size())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a0b581d9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 [CLS]\n", + "1 과\n", + "2 ##수\n", + "3 ##원에\n", + "4 사\n", + "5 ##과\n", + "6 ##가\n", + "7 많\n", + "8 ##았다\n", + "9 .\n", + "10 친\n", + "11 ##구\n", + "12 ##가\n", + "13 나\n", + "14 ##에게\n", + "15 사\n", + "16 ##과\n", + "17 ##했다\n", + "18 .\n", + "19 백\n", + "20 ##설\n", + "21 ##공\n", + "22 ##주는\n", + "23 독\n", + "24 ##이\n", + "25 든\n", + "26 사\n", + "27 ##과\n", + "28 ##를\n", + "29 먹\n", + "30 ##었다\n", + "31 .\n", + "32 [SEP]\n" + ] + } + ], + "source": [ + "for i, token_str in enumerate(tokenized_text):\n", + " print (i, token_str)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "eb56c9a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "사과가 많았다 tensor([-0.5844, -4.0836, 0.4906, 0.8915, -1.8054])\n", + "나에게 사과했다 tensor([-0.8631, -3.4047, -0.7351, 0.9805, -2.6700])\n", + "사과를 먹었다 tensor([ 0.6756, -0.3618, 0.0586, 2.2050, -2.4193])\n" + ] + } + ], + "source": [ + "print(\"사과가 많았다\", str(token_vecs_sum[6][:5]))\n", + "print(\"나에게 사과했다\", str(token_vecs_sum[10][:5]))\n", + "print(\"사과를 먹었다\", str(token_vecs_sum[19][:5]))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "dbc2e65c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*유사한* 의미에 대한 벡터 유사성: 0.86\n", + "*다른* 의미에 대한 벡터 유사성: 0.91\n" + ] + } + ], + "source": [ + "from scipy.spatial.distance import cosine\n", + "diff_apple = 1 - 
cosine(token_vecs_sum[5], token_vecs_sum[16])  # apple(fruit) vs. apology -> different meaning\n", + "same_apple = 1 - cosine(token_vecs_sum[5], token_vecs_sum[27])  # apple(fruit) vs. apple(fruit) -> same meaning\n", + "print('*유사한* 의미에 대한 벡터 유사성: %.2f' % same_apple)\n", + "print('*다른* 의미에 대한 벡터 유사성: %.2f' % diff_apple)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.pdf" "b/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.pdf" new file mode 100644 index 0000000..303071a Binary files /dev/null and "b/Week 16_\354\230\210\354\212\265\352\263\274\354\240\234_\353\254\270\354\233\220\354\240\225.pdf" differ