{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NT8dotfq-nS5"
   },
   "source": [
    "# **Ejercicio 3: Desarrollar y evaluar un Chunker utilizando el corpus CoNLL-2000 de NLTK**\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **1. Procesamiento del corpus conll2000.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1128,
     "status": "ok",
     "timestamp": 1665076933494,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "PZkTLjRB-yOQ",
    "outputId": "f3091bac-29f2-4280-b708-6b98e040162a"
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# Fetch the CoNLL-2000 corpus only when it is not installed yet, so the\n",
    "# notebook also runs from a fresh environment (Restart Kernel -> Run All)\n",
    "# without re-downloading on every execution.\n",
    "try:\n",
    "    nltk.data.find('corpora/conll2000')\n",
    "except LookupError:\n",
    "    nltk.download('conll2000', quiet=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **Cargar el corpus**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1288,
     "status": "ok",
     "timestamp": 1665077050767,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "gE2ONYeaUih6",
    "outputId": "166e664a-38e9-436e-81de-0c15bf5fd757"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train lenght: 8936\n",
      "test lenght: 2012\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from nltk.corpus import conll2000\n",
    "\n",
    "# Load the train and test splits of the corpus as chunked sentences.\n",
    "conll_train, conll_test = (\n",
    "    conll2000.chunked_sents(fileid) for fileid in ('train.txt', 'test.txt')\n",
    ")\n",
    "\n",
    "# Report how many sentences each split contains.\n",
    "train_size, test_size = len(conll_train), len(conll_test)\n",
    "print(\"train lenght: {}\".format(train_size))\n",
    "print(\"test lenght: {}\".format(test_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 598
    },
    "executionInfo": {
     "elapsed": 576,
     "status": "ok",
     "timestamp": 1665077140784,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "RGUfAxUWU64B",
    "outputId": "4cbd0e7d-ee98-4e42-e04c-7221a389f229"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  (NP Confidence/NN)\n",
      "  (PP in/IN)\n",
      "  (NP the/DT pound/NN)\n",
      "  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)\n",
      "  (NP another/DT sharp/JJ dive/NN)\n",
      "  if/IN\n",
      "  (NP trade/NN figures/NNS)\n",
      "  (PP for/IN)\n",
      "  (NP September/NNP)\n",
      "  ,/,\n",
      "  due/JJ\n",
      "  (PP for/IN)\n",
      "  (NP release/NN)\n",
      "  (NP tomorrow/NN)\n",
      "  ,/,\n",
      "  (VP fail/VB to/TO show/VB)\n",
      "  (NP a/DT substantial/JJ improvement/NN)\n",
      "  (PP from/IN)\n",
      "  (NP July/NNP and/CC August/NNP)\n",
      "  (NP 's/POS near-record/JJ deficits/NNS)\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "# Print the first training sentence as a bracketed chunk tree (text form).\n",
    "print(conll_train[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **Conversión de árboles a los chunks en formato IOB**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1800,
     "status": "ok",
     "timestamp": 1665077389956,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "VApxMp3dVz6y",
    "outputId": "05d9d84c-548f-410b-aa8a-d768711629b2"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'B-PP'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'B-VP'), ('widely', 'RB', 'I-VP'), ('expected', 'VBN', 'I-VP'), ('to', 'TO', 'I-VP'), ('take', 'VB', 'I-VP'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'O'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'B-PP'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'O'), ('for', 'IN', 'B-PP'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'B-VP'), ('to', 'TO', 'I-VP'), ('show', 'VB', 'I-VP'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'B-PP'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), (\"'s\", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]\n"
     ]
    }
   ],
   "source": [
    "import nltk.chunk\n",
    "\n",
    "# Flatten every chunk tree into a list of (word, POS tag, IOB chunk tag) triples.\n",
    "train_chunks = list(map(nltk.chunk.tree2conlltags, conll_train))\n",
    "test_chunks = list(map(nltk.chunk.tree2conlltags, conll_test))\n",
    "print(train_chunks[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **Preparar el conjunto de datos para entrenar un etiquetador de categorías gramaticales en chunks en formato IOB**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 352,
     "status": "ok",
     "timestamp": 1665077440929,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "EyRYGtaA-nS9",
    "outputId": "1699df5a-3c82-4411-c072-f319040c1a94"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('NN', 'B-NP'), ('IN', 'B-PP'), ('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'B-VP'), ('RB', 'I-VP'), ('VBN', 'I-VP'), ('TO', 'I-VP'), ('VB', 'I-VP'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NN', 'B-NP'), ('NNS', 'I-NP'), ('IN', 'B-PP'), ('NNP', 'B-NP'), (',', 'O'), ('JJ', 'O'), ('IN', 'B-PP'), ('NN', 'B-NP'), ('NN', 'B-NP'), (',', 'O'), ('VB', 'B-VP'), ('TO', 'I-VP'), ('VB', 'I-VP'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'B-PP'), ('NNP', 'B-NP'), ('CC', 'I-NP'), ('NNP', 'I-NP'), ('POS', 'B-NP'), ('JJ', 'I-NP'), ('NNS', 'I-NP'), ('.', 'O')]\n",
      "[('Confidence', 'B-NP'), ('in', 'B-PP'), ('the', 'B-NP'), ('pound', 'I-NP'), ('is', 'B-VP'), ('widely', 'I-VP'), ('expected', 'I-VP'), ('to', 'I-VP'), ('take', 'I-VP'), ('another', 'B-NP'), ('sharp', 'I-NP'), ('dive', 'I-NP'), ('if', 'O'), ('trade', 'B-NP'), ('figures', 'I-NP'), ('for', 'B-PP'), ('September', 'B-NP'), (',', 'O'), ('due', 'O'), ('for', 'B-PP'), ('release', 'B-NP'), ('tomorrow', 'B-NP'), (',', 'O'), ('fail', 'B-VP'), ('to', 'I-VP'), ('show', 'I-VP'), ('a', 'B-NP'), ('substantial', 'I-NP'), ('improvement', 'I-NP'), ('from', 'B-PP'), ('July', 'B-NP'), ('and', 'I-NP'), ('August', 'I-NP'), (\"'s\", 'B-NP'), ('near-record', 'I-NP'), ('deficits', 'I-NP'), ('.', 'O')]\n"
     ]
    }
   ],
   "source": [
    "# POS-based view: every token becomes a (POS tag, IOB chunk tag) pair.\n",
    "train = [[(pos, iob) for (_word, pos, iob) in sent] for sent in train_chunks]\n",
    "test = [[(pos, iob) for (_word, pos, iob) in sent] for sent in test_chunks]\n",
    "print(train[0])\n",
    "\n",
    "# Word-based view: every token becomes a (word, IOB chunk tag) pair.\n",
    "train1 = [[(word, iob) for (word, _pos, iob) in sent] for sent in train_chunks]\n",
    "test1 = [[(word, iob) for (word, _pos, iob) in sent] for sent in test_chunks]\n",
    "print(train1[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## **Entrenar un etiquetador de modelo oculto de Markov**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "executionInfo": {
     "elapsed": 911,
     "status": "ok",
     "timestamp": 1665078332620,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "smdvsXMMW1fc"
   },
   "outputs": [],
   "source": [
    "from nltk.tag import hmm\n",
    "\n",
    "# Train one HMM tagger per view of the training data:\n",
    "#   hmm_chunker  <- sentences of (POS tag, IOB chunk tag) pairs\n",
    "#   hmm_chunker1 <- sentences of (word, IOB chunk tag) pairs\n",
    "hmm_chunker, hmm_chunker1 = (\n",
    "    hmm.HiddenMarkovModelTagger.train(labeled_sents)\n",
    "    for labeled_sents in (train, train1)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Usar el etiquetador entrenado para etiquetar cada frase del conjunto de test**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 1943,
     "status": "ok",
     "timestamp": 1665078801093,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "rcdTqLPyZ2By",
    "outputId": "55316478-6b83-411d-b016-89bba2885505"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('Rockwell', 'NNP', 'B-NP', 'B-NP'), ('International', 'NNP', 'I-NP', 'I-NP'), ('Corp.', 'NNP', 'I-NP', 'I-NP'), (\"'s\", 'POS', 'B-NP', 'B-NP'), ('Tulsa', 'NNP', 'I-NP', 'I-NP'), ('unit', 'NN', 'I-NP', 'I-NP'), ('said', 'VBD', 'B-VP', 'B-VP'), ('it', 'PRP', 'B-NP', 'B-NP'), ('signed', 'VBD', 'B-VP', 'B-VP'), ('a', 'DT', 'B-NP', 'B-NP'), ('tentative', 'JJ', 'I-NP', 'I-NP'), ('agreement', 'NN', 'I-NP', 'I-NP'), ('extending', 'VBG', 'B-VP', 'B-VP'), ('its', 'PRP$', 'B-NP', 'B-NP'), ('contract', 'NN', 'I-NP', 'I-NP'), ('with', 'IN', 'B-PP', 'B-PP'), ('Boeing', 'NNP', 'B-NP', 'B-NP'), ('Co.', 'NNP', 'I-NP', 'I-NP'), ('to', 'TO', 'B-VP', 'B-VP'), ('provide', 'VB', 'I-VP', 'I-VP'), ('structural', 'JJ', 'B-NP', 'B-NP'), ('parts', 'NNS', 'I-NP', 'I-NP'), ('for', 'IN', 'B-PP', 'B-PP'), ('Boeing', 'NNP', 'B-NP', 'B-NP'), (\"'s\", 'POS', 'B-NP', 'B-NP'), ('747', 'CD', 'I-NP', 'I-NP'), ('jetliners', 'NNS', 'I-NP', 'I-NP'), ('.', '.', 'O', 'O')]\n",
      "[('Rockwell', 'NNP', 'B-NP', 'B-NP'), ('International', 'NNP', 'I-NP', 'I-NP'), ('Corp.', 'NNP', 'I-NP', 'I-NP'), (\"'s\", 'POS', 'B-NP', 'B-NP'), ('Tulsa', 'NNP', 'I-NP', 'I-NP'), ('unit', 'NN', 'I-NP', 'I-NP'), ('said', 'VBD', 'B-VP', 'B-VP'), ('it', 'PRP', 'B-NP', 'B-NP'), ('signed', 'VBD', 'B-VP', 'B-VP'), ('a', 'DT', 'B-NP', 'B-NP'), ('tentative', 'JJ', 'I-NP', 'I-NP'), ('agreement', 'NN', 'I-NP', 'I-NP'), ('extending', 'VBG', 'B-VP', 'B-PP'), ('its', 'PRP$', 'B-NP', 'B-NP'), ('contract', 'NN', 'I-NP', 'I-NP'), ('with', 'IN', 'B-PP', 'B-PP'), ('Boeing', 'NNP', 'B-NP', 'B-NP'), ('Co.', 'NNP', 'I-NP', 'I-NP'), ('to', 'TO', 'B-VP', 'B-VP'), ('provide', 'VB', 'I-VP', 'I-VP'), ('structural', 'JJ', 'B-NP', 'B-NP'), ('parts', 'NNS', 'I-NP', 'I-NP'), ('for', 'IN', 'B-PP', 'B-PP'), ('Boeing', 'NNP', 'B-NP', 'B-NP'), (\"'s\", 'POS', 'B-NP', 'B-VP'), ('747', 'CD', 'I-NP', 'B-NP'), ('jetliners', 'NNS', 'I-NP', 'I-NP'), ('.', '.', 'O', 'O')]\n"
     ]
    }
   ],
   "source": [
    "def tag_test_sentences(chunker, sents, column):\n",
    "    \"\"\"Tag each sentence with `chunker` and attach the prediction to the gold data.\n",
    "\n",
    "    Args:\n",
    "        chunker: trained tagger; its tag() method receives a list of symbols\n",
    "            and returns one (symbol, predicted_chunk) pair per symbol.\n",
    "        sents: sentences given as lists of (word, pos, gold_chunk) triples.\n",
    "        column: index of the triple field fed to the tagger\n",
    "            (0 = word, 1 = POS tag).\n",
    "\n",
    "    Returns:\n",
    "        One list per sentence of (word, pos, gold_chunk, predicted_chunk) tuples.\n",
    "    \"\"\"\n",
    "    tagged_sents = []\n",
    "    for sent in sents:\n",
    "        observed = [token[column] for token in sent]\n",
    "        predicted = chunker.tag(observed)\n",
    "        # zip pairs each gold triple with its prediction; no index bookkeeping.\n",
    "        tagged_sents.append([\n",
    "            (word, pos, gold, pred)\n",
    "            for (word, pos, gold), (_symbol, pred) in zip(sent, predicted)\n",
    "        ])\n",
    "    return tagged_sents\n",
    "\n",
    "\n",
    "# Chunker trained on POS tags: feed it the POS column of the test set.\n",
    "test_etiquetado = tag_test_sentences(hmm_chunker, test_chunks, column=1)\n",
    "print(test_etiquetado[0])\n",
    "\n",
    "# Chunker trained on words: feed it the word column of the test set.\n",
    "test_etiquetado1 = tag_test_sentences(hmm_chunker1, test_chunks, column=0)\n",
    "print(test_etiquetado1[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Generar un archivo de texto con el conjunto de test etiquetado en chunks**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "executionInfo": {
     "elapsed": 462,
     "status": "ok",
     "timestamp": 1665079011839,
     "user": {
      "displayName": "Juan Felipe Jaramillo Hernandez",
      "userId": "00110120478197701536"
     },
     "user_tz": -120
    },
    "id": "_KSwHGg8cCeK"
   },
   "outputs": [],
   "source": [
    "def write_tagged_sentences(path, sentences):\n",
    "    \"\"\"Write tagged sentences to `path` as whitespace-separated columns.\n",
    "\n",
    "    Each token goes on its own line as \"word pos gold_chunk predicted_chunk\";\n",
    "    an empty line is written after every sentence.\n",
    "\n",
    "    Args:\n",
    "        path: destination text file (overwritten if it exists).\n",
    "        sentences: lists of (word, pos, gold_chunk, predicted_chunk) tuples.\n",
    "    \"\"\"\n",
    "    with open(path, 'w') as f:\n",
    "        for sentence in sentences:\n",
    "            for word, pos, gold, pred in sentence:\n",
    "                f.write(\"{} {} {} {}\\n\".format(word, pos, gold, pred))\n",
    "            f.write(\"\\n\")\n",
    "\n",
    "\n",
    "# One file per chunker: POS-based predictions, then word-based predictions.\n",
    "write_tagged_sentences('test_etiquetado.txt', test_etiquetado)\n",
    "write_tagged_sentences('test_etiquetado1.txt', test_etiquetado1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## **Evaluar el etiquetador usando la librería https://github.com/sighsmile/conlleval**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import subprocess\n",
    "import sys\n",
    "\n",
    "# Run conlleval.py (expected in the working directory) on each tagged file,\n",
    "# feeding the file on stdin and capturing the report from stdout.\n",
    "# sys.executable makes sure the kernel's own interpreter is used, and\n",
    "# check=True raises CalledProcessError instead of silently ignoring a failure.\n",
    "for tagged_path, report_path in (\n",
    "    ('test_etiquetado.txt', 'result.txt'),\n",
    "    ('test_etiquetado1.txt', 'result1.txt'),\n",
    "):\n",
    "    with open(tagged_path, 'rb') as tagged, open(report_path, 'wb') as report:\n",
    "        subprocess.run(\n",
    "            [sys.executable, 'conlleval.py'],\n",
    "            stdin=tagged,\n",
    "            stdout=report,\n",
    "            check=True,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "POS,IOBtags\n",
      "-----------\n",
      "processed 49389 tokens with 21891 phrases; found: 22758 phrases; correct: 19008.\n",
      "accuracy:  92.53%; (non-O)\n",
      "accuracy:  90.55%; precision:  83.52%; recall:  86.83%; FB1:  85.14\n",
      "               NP: precision:  83.76%; recall:  86.45%; FB1:  85.08  12821\n",
      "               PP: precision:  83.50%; recall:  92.16%; FB1:  87.62  5310\n",
      "               VP: precision:  82.88%; recall:  82.33%; FB1:  82.61  4627\n",
      "\n",
      "Word,IOBtags\n",
      "------------\n",
      "processed 49389 tokens with 21891 phrases; found: 21997 phrases; correct: 17639.\n",
      "accuracy:  87.77%; (non-O)\n",
      "accuracy:  88.52%; precision:  80.19%; recall:  80.58%; FB1:  80.38\n",
      "               NP: precision:  79.65%; recall:  78.46%; FB1:  79.05  12236\n",
      "               PP: precision:  85.57%; recall:  92.60%; FB1:  88.95  5206\n",
      "               VP: precision:  75.48%; recall:  73.81%; FB1:  74.63  4555\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print both evaluation reports, each under a heading naming the input view.\n",
    "reports = (\n",
    "    (\"POS,IOBtags\", \"-----------\", 'result.txt'),\n",
    "    (\"Word,IOBtags\", \"------------\", 'result1.txt'),\n",
    ")\n",
    "for title, rule, report_path in reports:\n",
    "    print(title)\n",
    "    print(rule)\n",
    "    with open(report_path, 'r') as f:\n",
    "        print(f.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "provenance": []
  },
  "interpreter": {
   "hash": "0b7331accf1a0a6fe59a150214e12f4eee184c355984afe55172bb10dfecd1f7"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
