@aboSamoor
Last active December 23, 2015 06:59
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from io import open\n",
"from glob import glob\n",
"from numpy import asarray, dot"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"files = glob('/media/jarra/word2vec/Word2VecExample/vectors/*')\n",
"words = []\n",
"embeddings = []\n",
"for file in files:\n",
" for line in open(file):\n",
" ws = line.strip().split()\n",
" if len(ws) != 251:\n",
" continue\n",
" words.append(ws[0])\n",
" embeddings.append([float(x) for x in ws[1:]])\n",
"embeddings = asarray(embeddings)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 54
},
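The loading loop above (one word followed by 250 floats per line, skipping malformed lines) can be sketched as a standalone function. This is a minimal sketch; the 3-dimensional toy vectors and the `parse_vectors` name are illustrative, not part of the notebook:

```python
import numpy as np

def parse_vectors(lines, dim=250):
    """Parse word2vec-style text lines: a word followed by `dim` floats.

    Lines with the wrong number of fields (e.g. truncated writes) are
    skipped, mirroring the `len(ws) != 251` guard in the cell above.
    """
    words, vecs = [], []
    for line in lines:
        ws = line.strip().split()
        if len(ws) != dim + 1:
            continue
        words.append(ws[0])
        vecs.append([float(x) for x in ws[1:]])
    return words, np.asarray(vecs)

words, vecs = parse_vectors(["apple 0.1 0.2 0.3", "broken 0.1"], dim=3)
```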
{
"cell_type": "code",
"collapsed": false,
"input": [
"from sklearn.neighbors import NearestNeighbors\n",
"\n",
"knn = NearestNeighbors(n_neighbors=20, algorithm='ball_tree', p=2)\n",
"knn.fit(embeddings)\n",
"\n",
"word_id = {w:i for i, w in enumerate(words)}\n",
"id_word = {i:w for i, w in enumerate(words)}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 74
},
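A minimal sketch of the same `NearestNeighbors` query on toy 2-D points. Note that recent scikit-learn versions require a 2-D array for `kneighbors` (the notebook's era accepted a bare vector), so the single query point is reshaped here:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# three toy 2-D "embeddings"
pts = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0]])
knn = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(pts)

# query with the first point; reshape(1, -1) makes it a 1-row matrix
dist, idx = knn.kneighbors(pts[0].reshape(1, -1))
```

As in the results below, the query point itself comes back first with distance 0.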
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get_neighbors(word):\n",
" id_ = word_id[word]\n",
" point = embeddings[id_]\n",
" get_knn_point(point)\n",
"\n",
"def get_knn_point(point):\n",
" distances, indices = knn.kneighbors(point)\n",
" for i, (index, d) in enumerate(zip(indices[0], distances[0])):\n",
" print '{: <3}{: <20}{: <10}'.format(i, id_word[index], d)\n",
" \n",
"def get_knn_phrase(phrase):\n",
" accumulator = get_phrase_point(phrase)\n",
" get_knn_point(accumulator)\n",
" \n",
"def get_phrase_point(phrase):\n",
" accumulator = 0\n",
" for word in phrase.strip().split():\n",
" accumulator += embeddings[word_id[word]]\n",
" return accumulator"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 82
},
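The accumulator in `get_phrase_point` relies on NumPy broadcasting: `0 + vector` yields a vector. A self-contained toy version (the 2-D embeddings and two-word vocabulary here are made up for illustration):

```python
import numpy as np

word_id = {"very": 0, "good": 1}
embeddings = np.array([[1.0, 2.0],
                       [3.0, 4.0]])

def phrase_point(phrase):
    # start from scalar 0; the first += turns it into a vector
    accumulator = 0
    for word in phrase.strip().split():
        accumulator += embeddings[word_id[word]]
    return accumulator

point = phrase_point("very good")
```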
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nearest Neighbours of words\n",
"============================\n",
"\n",
"This is a typical experiment where we sort the words by their Euclidean distance.\n",
"\n",
"**Observations**\n",
"\n",
"* apple is mainly a company in these embeddings.\n",
"\n",
"* book and books are close to each other. The space is partitioned by meaning, not by part of speech.\n",
"\n",
"* The neighbors of \"the\" are not quite expected."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_neighbors('apple')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 apple 0.0 \n",
"1 ibook 2.45441387259\n",
"2 ilife 2.6324920856\n",
"3 iifx 2.68339841114\n",
"4 laserwriter 2.68894626153\n",
"5 macintoshes 2.72100867622\n",
"6 macintosh 2.7465602912\n",
"7 hypercard 2.74976202767\n",
"8 emac 2.76713004891\n",
"9 pcjr 2.76880134334\n",
"10 iix 2.77605432771\n",
"11 filemaker 2.78940634298\n",
"12 iigs 2.80644647114\n",
"13 macwrite 2.82640246101\n",
"14 appleworks 2.8268557242\n",
"15 iici 2.82994404995\n",
"16 ultrix 2.83341137466\n",
"17 desknote 2.8353197512\n",
"18 idvd 2.84045199398\n",
"19 garageband 2.84403010264\n"
]
}
],
"prompt_number": 86
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_neighbors('january')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 january 0.0 \n",
"1 march 0.990577776379\n",
"2 february 0.995405360592\n",
"3 december 1.00770528005\n",
"4 april 1.06711900891\n",
"5 november 1.12036214401\n",
"6 october 1.15229411791\n",
"7 july 1.20908610312\n",
"8 august 1.22016681435\n",
"9 september 1.33152866717\n",
"10 june 1.4797197978\n",
"11 one 1.80048644633\n",
"12 nine 1.8731231034\n",
"13 eight 1.95164583491\n",
"14 two 1.9520374482\n",
"15 seven 1.95837319942\n",
"16 six 1.97104882071\n",
"17 four 2.01665230496\n",
"18 ogumahideo 2.02079829584\n",
"19 five 2.02127420304\n"
]
}
],
"prompt_number": 87
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_neighbors('the')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 the 0.0 \n",
"1 omobe 1.11880177048\n",
"2 zandramas 1.1371528579\n",
"3 nyabinghi 1.13799240546\n",
"4 smilga 1.13822478466\n",
"5 armisael 1.13952230814\n",
"6 braiterman 1.14003221416\n",
"7 hoshanah 1.14125947399\n",
"8 azetbur 1.14194853261\n",
"9 kollupitiya 1.14295614341\n",
"10 librian 1.14349397418\n",
"11 grunitzky 1.14365235389\n",
"12 hokage 1.14478282816\n",
"13 zerchi 1.14522446395\n",
"14 sercia 1.14551989932\n",
"15 besigye 1.14561000423\n",
"16 kentrat 1.14628596163\n",
"17 boedromion 1.14709324335\n",
"18 morlun 1.14856902356\n",
"19 asrar 1.14892649238\n"
]
}
],
"prompt_number": 88
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_neighbors('book')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 book 0.0 \n",
"1 books 1.88765684011\n",
"2 variorum 1.90695920674\n",
"3 miscellanies 1.91086332941\n",
"4 blackly 1.91939505269\n",
"5 reprinting 1.92737927322\n",
"6 chapbook 1.93713267613\n",
"7 aryabhatiya 1.93724649155\n",
"8 unexpurgated 1.94173677311\n",
"9 helaman 1.95698111353\n",
"10 loompanics 1.9578002733\n",
"11 invisibles 1.96178201394\n",
"12 jasher 1.96534553896\n",
"13 mythadventures 1.96677258553\n",
"14 disneywar 1.97259642784\n",
"15 pallisers 1.97346406837\n",
"16 ripliad 1.9841095887\n",
"17 forewords 1.98496842113\n",
"18 epigraph 1.98529029477\n",
"19 barsetshire 1.98571507086\n"
]
}
],
"prompt_number": 102
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nearest Neighbours of phrases\n",
"==============================\n",
"Here, I sum up the vectors of the phrase and then look at the nearest neighbours.\n",
"\n",
"**Observations**\n",
"\n",
"* The words that make up the phrase are still the closest to the phrase.\n",
"* Notice that not all the words have the same influence: in the phrase \"second month of the year\", only {second, month, year} are close to the phrase; it does not seem that {of, the} changed the phrase embedding that much."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"very good\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 very 2.02757268595\n",
"1 good 2.14511392\n",
"2 quite 2.41730050239\n",
"3 extremely 2.63383599952\n",
"4 too 2.63744652432\n",
"5 fairly 2.65976621896\n",
"6 bad 2.79797117654\n",
"7 enough 2.8268833239\n",
"8 relatively 2.87095658184\n",
"9 exceedingly 2.87283047902\n",
"10 natured 2.87803582614\n",
"11 comparatively 2.87915718488\n",
"12 extraordinarily 2.92273364493\n",
"13 remarkably 2.99119514728\n",
"14 reasonably 2.99165171247\n",
"15 poor 2.99549033148\n",
"16 tough 2.99972125818\n",
"17 surprisingly 3.00423470826\n",
"18 exceptionally 3.00542705902\n",
"19 so 3.01978297284\n"
]
}
],
"prompt_number": 89
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"not good\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 not 2.02757268595\n",
"1 good 2.22214639454\n",
"2 never 2.86305661863\n",
"3 indeed 2.8889430765\n",
"4 bad 2.91523993861\n",
"5 whatever 2.92094027292\n",
"6 nothing 2.92411460062\n",
"7 so 2.93375335615\n",
"8 always 2.95124772163\n",
"9 really 2.98023192259\n",
"10 actually 2.98412447259\n",
"11 therefore 2.9869738569\n",
"12 anyway 2.99071470444\n",
"13 only 2.99642716642\n",
"14 merely 2.99831283526\n",
"15 something 3.00975725443\n",
"16 otherwise 3.02370475405\n",
"17 perfectly 3.02575200851\n",
"18 cannot 3.04011813612\n",
"19 thing 3.04675479849\n"
]
}
],
"prompt_number": 90
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"not bad\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 bad 2.22214639454\n",
"1 not 2.41156583302\n",
"2 good 2.74208441996\n",
"3 really 3.05039139775\n",
"4 never 3.07487082936\n",
"5 wrong 3.10786612541\n",
"6 nothing 3.11383084188\n",
"7 indeed 3.11558700764\n",
"8 something 3.11843176179\n",
"9 ugly 3.13169091161\n",
"10 actually 3.13367017276\n",
"11 anyway 3.1372355043\n",
"12 so 3.15331458279\n",
"13 simply 3.16526963427\n",
"14 thing 3.19701706287\n",
"15 whatever 3.19772031188\n",
"16 merely 3.19839313588\n",
"17 luck 3.20110591184\n",
"18 obviously 3.20233166983\n",
"19 it 3.20701670968\n"
]
}
],
"prompt_number": 91
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"extremely bad\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 extremely 2.41156583302\n",
"1 bad 2.43449467574\n",
"2 very 2.83048791316\n",
"3 too 3.01777278005\n",
"4 quite 3.02441198258\n",
"5 good 3.07320611164\n",
"6 ugly 3.07887115698\n",
"7 incredibly 3.15114734449\n",
"8 fairly 3.17279002574\n",
"9 extraordinarily 3.18893680906\n",
"10 relatively 3.20866241465\n",
"11 tough 3.21635598437\n",
"12 exceedingly 3.21955500629\n",
"13 overly 3.23500915437\n",
"14 dangerous 3.25019820497\n",
"15 natured 3.27363652607\n",
"16 unpredictable 3.28204861682\n",
"17 exceptionally 3.29938492336\n",
"18 unusually 3.29947611282\n",
"19 poor 3.3073176501\n"
]
}
],
"prompt_number": 92
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"second month of the year\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 year 5.0029650795\n",
"1 month 5.04977245949\n",
"2 week 5.58140471293\n",
"3 second 5.6983698843\n",
"4 months 5.74828646333\n",
"5 days 5.78854236335\n",
"6 fifth 5.82172430487\n",
"7 day 5.8519089308\n",
"8 fourth 5.85947580542\n",
"9 third 5.92042757004\n",
"10 sixth 5.92176988188\n",
"11 weeks 6.0653731202\n",
"12 eighth 6.08155219206\n",
"13 elul 6.08231423041\n",
"14 ten 6.08644369931\n",
"15 sabbatical 6.09076069834\n",
"16 decade 6.1035932397\n",
"17 fortnight 6.10438356555\n",
"18 last 6.10476745342\n",
"19 intercalary 6.10619566139\n"
]
}
],
"prompt_number": 93
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"get_knn_phrase(\"the month after january\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 month 3.98166400675\n",
"1 january 4.20514862572\n",
"2 december 4.33863770453\n",
"3 february 4.35422865987\n",
"4 march 4.35696733076\n",
"5 april 4.35786995855\n",
"6 months 4.37816463422\n",
"7 november 4.38389342062\n",
"8 days 4.42426412924\n",
"9 july 4.43140140413\n",
"10 october 4.43658944455\n",
"11 september 4.44553008295\n",
"12 year 4.4474416121\n",
"13 august 4.44804864964\n",
"14 week 4.47481899869\n",
"15 after 4.47831544409\n",
"16 june 4.60817926598\n",
"17 before 4.6250205963\n",
"18 weeks 4.68978529386\n",
"19 day 4.7146339223\n"
]
}
],
"prompt_number": 94
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Decompose a sentence\n",
"=====================\n",
"\n",
"Given that a phrase is the sum of the vectors of its words, to reverse the procedure we have to solve the linear system\n",
"```python\n",
"Phrase = dot(Occurrence Matrix, Embeddings Matrix)\n",
"Occurrence Matrix = dot(Phrase, Embedding Matrix Inverse)\n",
"```\n",
"Therefore, we need to calculate the pseudo-inverse of the embeddings matrix and look at the largest components of the solution.\n",
"\n",
"I am surprised it works really well! Of course, it is harder to decide at which component we should stop. However, we can build a partial solution, compare it to the sentence, and stop when we start diverging from the sentence :).\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from numpy.linalg import pinv\n",
"\n",
"embeddings_inverse = pinv(embeddings)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 34
},
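A toy version of the decomposition idea, small enough to check by hand. With three "words" whose third embedding is the sum of the first two, the pseudo-inverse still recovers exact coefficients for a phrase built from words 0 and 2 (the matrix here is illustrative, not from the notebook's data):

```python
import numpy as np

# toy 2-D embeddings for three "words"; row 2 = row 0 + row 1
E = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])

phrase = E[0] + E[2]                 # "phrase" = word 0 + word 2
coeffs = phrase @ np.linalg.pinv(E)  # least-squares occurrence vector
top = np.argsort(coeffs)[::-1]       # largest components first
```

Here `coeffs` comes out exactly `[1, 0, 1]`, so the two constituent words are the top components, as in the `decompose` outputs below.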
{
"cell_type": "code",
"collapsed": false,
"input": [
"def decompose(phrase):\n",
" words = dot(get_phrase_point(phrase), embeddings_inverse)\n",
" print '\\n'.join(['{: <3}{: <20}{: <10}'.format(i, id_word[x], words[x]) for i, x in enumerate(reversed(words.argsort()[-15:]))])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 98
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"decompose('not good')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 not 0.00454078592865\n",
"1 good 0.00289725566233\n",
"2 couples 0.00276267718622\n",
"3 hth 0.00239669137487\n",
"4 bad 0.00231876458505\n",
"5 neither 0.00231822672031\n",
"6 never 0.00226820676811\n",
"7 nowrap 0.00212882344283\n",
"8 nor 0.00210485322845\n",
"9 nothing 0.00202502414415\n",
"10 cannot 0.00196498505034\n",
"11 poorly 0.00192701451615\n",
"12 luck 0.00189786580745\n",
"13 unable 0.00188411258273\n",
"14 dornier 0.00184270732512\n"
]
}
],
"prompt_number": 99
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"decompose('second month of the year')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 year 0.00818805499276\n",
"1 month 0.00751032803789\n",
"2 gregorian 0.00636465953281\n",
"3 calendar 0.00546181072521\n",
"4 second 0.00539502185446\n",
"5 months 0.00538200401731\n",
"6 week 0.0052643120389\n",
"7 equinox 0.00497019081682\n",
"8 fifth 0.00486776245734\n",
"9 vernal 0.00484171622925\n",
"10 years 0.00478449424801\n",
"11 days 0.00476471815298\n",
"12 day 0.00470655757576\n",
"13 fourth 0.00468227683716\n",
"14 staggered 0.00467989278889\n"
]
}
],
"prompt_number": 100
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"decompose('president of united states is barrack obama')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 united 0.0117301004546\n",
"1 states 0.0112717923368\n",
"2 wisconsinaccording 0.0100026352853\n",
"3 bureau 0.00920992570253\n",
"4 kingdom 0.00792875990308\n",
"5 naturalized 0.00737911901475\n",
"6 lyndon 0.00713012544686\n",
"7 nominees 0.0066013885303\n",
"8 retrospectively 0.00656759081726\n",
"9 president 0.00651313494474\n",
"10 tempore 0.0064619825011\n",
"11 emirates 0.00613937108443\n",
"12 vice 0.00582700772234\n",
"13 senate 0.00565314532246\n",
"14 presidential 0.00559438495095\n"
]
}
],
"prompt_number": 101
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Influence of Frequency on Vector Norm\n",
"======================================\n",
"\n",
"The results do not show anything obvious; again, we do not know how these vectors were trained."
]
},
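The norm computation in the next cell is the row-wise Euclidean norm, equivalent to `np.linalg.norm(..., axis=1)`; a tiny sanity check with made-up vectors:

```python
import numpy as np

v = np.array([[3.0, 4.0],
              [1.0, 0.0]])
norms = (v ** 2).sum(axis=1) ** 0.5  # row-wise L2 norm: [5.0, 1.0]
```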
{
"cell_type": "code",
"collapsed": false,
"input": [
"norms = (embeddings ** 2).sum(axis=1) ** 0.5\n",
"indices = norms.argsort()\n",
"print \"Largest vectors\"\n",
"print '\\n'.join(['{: <3}{: <20}{: <10}'.format(i, id_word[x], norms[x]) for i, x in enumerate(reversed(indices[-50:]))])\n",
"\n",
"print \"\\n\\nSmallest vectors\"\n",
"print '\\n'.join(['{: <3}{: <20}{: <10}'.format(i, id_word[x], norms[x]) for i, x in enumerate(indices[:50])])\n",
"\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Largest vectors\n",
"0 multilicensewithcc 10.3268512519\n",
"1 jdforrester 10.0154157796\n",
"2 hth 10.0003442263\n",
"3 uploads 9.7868881845\n",
"4 wapcaplet 9.67973261777\n",
"5 retrospectively 9.56892967102\n",
"6 householder 9.33118443663\n",
"7 islander 9.05014823454\n",
"8 chordata 8.93656075554\n",
"9 classis 8.76696996107\n",
"10 namespace 8.6128387668\n",
"11 latino 8.5979219294\n",
"12 ordo 8.17901131632\n",
"13 hereby 8.15611378317\n",
"14 familia 7.95104060729\n",
"15 attribution 7.9373325256\n",
"16 jpeg 7.91507147932\n",
"17 households 7.67802718897\n",
"18 residing 7.52167119742\n",
"19 hispanic 7.48807752896\n",
"20 capita 7.47718165488\n",
"21 mathbf 7.45371968805\n",
"22 animalia 7.45246890654\n",
"23 cdot 7.42279401665\n",
"24 makeup 7.41894034817\n",
"25 couples 7.34979126253\n",
"26 frac 7.16273976919\n",
"27 rangle 7.094790385\n",
"28 bgcolor 7.08054397099\n",
"29 wisconsinaccording 6.96499767777\n",
"30 qquad 6.91234108173\n",
"31 licence 6.88492717492\n",
"32 mammalia 6.88148760443\n",
"33 phylum 6.87925932348\n",
"34 photographed 6.83425987876\n",
"35 cdots 6.79569890312\n",
"36 aves 6.79556269479\n",
"37 mbox 6.73414158759\n",
"38 median 6.72063447693\n",
"39 mathrm 6.70333544242\n",
"40 nowrap 6.69788441221\n",
"41 eeeeaa 6.60665266213\n",
"42 kommunedata 6.57814991347\n",
"43 courtesy 6.57624628835\n",
"44 morwen 6.55818726435\n",
"45 langle 6.55069524732\n",
"46 fdl 6.49573204626\n",
"47 regnum 6.48539039332\n",
"48 kmd 6.47439187599\n",
"49 template 6.47184635727\n",
"\n",
"\n",
"Smallest vectors\n",
"0 tapairu 0.0170473963115\n",
"1 forestburg 0.0174300506597\n",
"2 wilcoxes 0.0175761717959\n",
"3 ramn 0.0176507450551\n",
"4 rotaviruses 0.0177284992033\n",
"5 chahinkapa 0.0178666974564\n",
"6 samogon 0.0179913211299\n",
"7 blinne 0.0180032805344\n",
"8 woldumar 0.0180729706745\n",
"9 kobach 0.0182432231801\n",
"10 lifferth 0.0182973014677\n",
"11 cityofgoleta 0.0184365699901\n",
"12 takere 0.0185601906779\n",
"13 bgct 0.0186364083986\n",
"14 strataflash 0.018769974667\n",
"15 dicephalus 0.0188615523221\n",
"16 </s> 0.0188826574136\n",
"17 varima 0.018995082232\n",
"18 kruczak 0.0190906993324\n",
"19 tusten 0.0215875820554\n",
"20 wurtsboro 0.0224239146449\n",
"21 narrowsburg 0.0243615605617\n",
"22 cochecton 0.0245859408402\n",
"23 braeswood 0.0249381597356\n",
"24 armaou 0.0265067232792\n",
"25 ardenti 0.0274897423233\n",
"26 shandaken 0.0279073472226\n",
"27 takrur 0.0279678326118\n",
"28 allomothers 0.0302510569072\n",
"29 covertext 0.0303092984907\n",
"30 poliphilo 0.0348113839282\n",
"31 yelabuga 0.0358172907267\n",
"32 ictineu 0.0380123283949\n",
"33 epfcg 0.0385076033401\n",
"34 spinn 0.0386281221651\n",
"35 rifton 0.040149943462\n",
"36 kenu 0.0405585154561\n",
"37 hakudoshi 0.0408182057298\n",
"38 crumhorns 0.041133521804\n",
"39 schmiedeleut 0.0411884093648\n",
"40 lehrerleut 0.0425683737181\n",
"41 rublei 0.0435910749466\n",
"42 surrett 0.0443516885361\n",
"43 zaires 0.0451846235129\n",
"44 loveppears 0.0453572166033\n",
"45 prasanthi 0.0455711129994\n",
"46 ritonavir 0.0455812074983\n",
"47 polytechnicien 0.0462011677125\n",
"48 kuebler 0.0469128573314\n",
"49 joydev 0.0471839763585\n"
]
}
],
"prompt_number": 81
}
],
"metadata": {}
}
]
}