Skip to content

Instantly share code, notes, and snippets.

@apohllo
Created July 24, 2017 13:15
Show Gist options
  • Save apohllo/80e4e279269e50b237f249f18221d7ee to your computer and use it in GitHub Desktop.
Save apohllo/80e4e279269e50b237f249f18221d7ee to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"import nltk\n",
"from nltk import word_tokenize\n",
"from pyMorfologik import Morfologik\n",
"from pyMorfologik.parsing import ListParser\n",
"import re\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"___________________________ 6 0\n",
"walkosze 6 100000\n",
"coaldale 9 200000\n",
"utrzmania 12 300000\n",
"laloubere 18 400000\n",
"sypiani 24 500000\n",
"morchella 36 600000\n",
"zwierzchni 60 700000\n",
"barkowo-obojczykowego 135 800000\n",
"otwieraną 564 900000\n",
"silnikowego 5918 1000000\n"
]
}
],
"source": [
"gpath_2 = '/net/scratch/people/plgapohl/pl-v2w/Polish/'\n",
"dictionary = {}\n",
"i = 0\n",
"with open(gpath_2 + 'word-counts.txt', 'r') as f:\n",
" for line in f:\n",
" count, word = line.split()\n",
" count = int(count)\n",
" dictionary[word] = count\n",
" if i % 100000 == 0:\n",
" print(word, count, i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a 0\n",
"fosylizacyjnością 1000000\n",
"mężczyźniska 2000000\n",
"nienidziańskość 3000000\n",
"odwadniając 4000000\n",
"przemakałobyś 5000000\n",
"Szczurku 6000000\n",
"wytrzebiałbyś 7000000\n"
]
}
],
"source": [
"valid_words = set()\n",
"i = 0\n",
"with open(gpath_2 + 'polimorf.txt', 'r') as f:\n",
" for line in f:\n",
" valid_words.add(line.rstrip())\n",
" if i % 1000000 == 0:\n",
" print(line.rstrip(), i)\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"first_names = set()\n",
"for file in ['imiona_meskie_utf8.txt', 'imiona_zenskie_utf8.txt']:\n",
" with open(gpath_2 + file, 'r') as f:\n",
" for line in f:\n",
" first_names.add(line.rstrip())\n",
" first_names.add(line.rstrip().lower())"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"surnames = set()\n",
"for file in ['nazwiska.txt']:\n",
" with open(gpath_2 + file, 'r') as f:\n",
" for line in f:\n",
" surnames.add(line.rstrip())\n",
" surnames.add(line.rstrip().lower())"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"FREQUENCY = dictionary\n",
"VALID = valid_words\n",
"\n",
"def freq(word): \n",
" try:\n",
" return FREQUENCY[word]\n",
" except KeyError:\n",
" return 0\n",
"\n",
"def correction(word): \n",
" \"Most probable spelling correction for word.\"\n",
" return max(candidates(word), key=freq)\n",
"\n",
"def candidates(word): \n",
" \"Generate possible spelling corrections for word.\"\n",
" return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])\n",
"\n",
"def known(words): \n",
" \"The subset of `words` that appear in the dictionary of WORDS.\"\n",
" return set(w for w in words if w in VALID)\n",
"\n",
"def edits1(word):\n",
" \"All edits that are one edit away from `word`.\"\n",
" letters = 'abcdefghijklmnopqrstuvwxyzążśźęćńół'\n",
" splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
" deletes = [L + R[1:] for L, R in splits if R]\n",
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]\n",
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
" inserts = [L + c + R for L, R in splits for c in letters]\n",
" return set(deletes + transposes + replaces + inserts)\n",
"\n",
"def edits2(word): \n",
" \"All edits that are two edits away from `word`.\"\n",
" return (e2 for e1 in edits1(word) for e2 in edits1(e1))"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'sie'"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"correction('sie')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\".a\".isalpha()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def decapitalize(word):\n",
" return word[0:1].lower() + word[1:]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def most_popular_tuple(list_of_stems, frequency):\n",
" best_key, best_count = None, 0\n",
" for word in list_of_stems:\n",
" try:\n",
" current_count = frequency[word]\n",
" if(current_count > best_count):\n",
" best_key = word\n",
" best_count = current_count\n",
" except KeyError:\n",
" if(best_key == None):\n",
" best_key = word\n",
" return best_key"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def find_stems(word, stemmer, parser):\n",
" result = stemmer.stem([word], parser)\n",
" if(word.istitle()):\n",
" result += stemmer.stem([decapitalize(word)], parser)\n",
" stems = []\n",
" for tuple in result:\n",
" stems += list(tuple[1].keys())\n",
" return stems"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def select_stem(word, stemmer, parser, frequency):\n",
" stems = find_stems(word, stemmer, parser)\n",
" if(len(stems) == 0 and word.isalpha()):\n",
" stems = find_stems(correction(word), stemmer, parser)\n",
" if(len(stems) == 0):\n",
" return [word, word]\n",
" else:\n",
" return [word, most_popular_tuple(stems, frequency)]"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"DIGIT_RE = re.compile('^\\d+$')\n",
"NONLETTER_RE = re.compile('[\\W\\d_][\\w]|[\\w][\\W\\d_]')\n",
"\n",
"\n",
"def convert(line, stemmer, parser, frequency):\n",
" words = word_tokenize(line)\n",
" result = []\n",
" for word in words:\n",
" if DIGIT_RE.match(word):\n",
" result.append([word, word, f\"<number length='{len(word)}'/>\"])\n",
" elif NONLETTER_RE.search(word):\n",
" result.append([word, word, f\"<identifer length='{len(word)}'/>\"])\n",
" else:\n",
" word, base = select_stem(word, stemmer, parser, frequency)\n",
" if(word in first_names and word[0].isupper()):\n",
" result.append([word, base, f\"<first-name length='{len(word)}'/>\"])\n",
" elif(word in surnames and word[0].isupper()):\n",
" result.append([word, base, f\"<last-name length='{len(word)}'/>\"])\n",
" else:\n",
" result.append([word, base, \"\"])\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['1000', '1000', \"<number length='4'/>\"],\n",
" ['Alicja', 'Alicja', \"<first-name length='6'/>\"],\n",
" ['Kowalska', 'kowalski', \"<last-name length='8'/>\"],\n",
" ['ma', 'mieć', ''],\n",
" ['kota', 'kota', ''],\n",
" ['na', 'na', ''],\n",
" ['11aa', '11aa', \"<identifer length='4'/>\"],\n",
" ['12323', '12323', \"<number length='5'/>\"],\n",
" ['polsko-polski', 'polsko-polski', \"<identifer length='13'/>\"]]"
]
},
"execution_count": 153,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"convert(\"1000 Alicja Kowalska ma kota na 11aa 12323 polsko-polski\", stemmer, parser, dictionary)"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_sre.SRE_Match object; span=(1, 3), match='a-'>"
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NONLETTER_RE.search(\"aa-a\")"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"NONLETTER_RE.match('11aa')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"stemmer = Morfologik()\n",
"parser = ListParser()"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------- 1 ----------\n",
"[['Dzień', 'dzień', ''], ['dobry', 'dobry', ''], [',', ',', ''], ['Marta', 'Marta', \"<first-name length='5'/>\"], ['Osuchowska', 'osuchowski', \"<last-name length='10'/>\"], ['.', '.', ''], ['W', 'w', ''], ['czym', 'co', ''], ['mogę', 'móc', ''], ['pomóc', 'pomóc', ''], ['?', '?', '']]\n",
"[['Witam', 'witać', ''], [',', ',', ''], ['chialem', 'chmal', ''], ['sie', 'si', ''], ['dowiedziec', 'dowiedzieć', ''], ['jakie', 'jaki', ''], ['posiadacie', 'posiadać', ''], ['Państwo', 'państwo', ''], ['Karty', 'karty', ''], ['Kredytowe', 'kredytowy', '']]\n",
"[['Czy', 'czy', ''], ['jest', 'być', ''], ['Pan', 'pan', ''], ['klientem', 'klient', ''], ['banku', 'bank', ''], ['?', '?', '']]\n",
"[['Jeszcze', 'jeszcze', ''], ['nie', 'nie', ''], [',', ',', ''], ['ale', 'ale', ''], ['moze', 'może', ''], ['zostane', 'zostać', '']]\n",
"[['Czego', 'co', ''], ['Pan', 'pan', ''], ['oczekuję', 'oczekiwać', ''], ['od', 'od', ''], ['karty', 'karty', ''], ['kredytowej', 'kredytowy', ''], ['?', '?', ''], ['W', 'w', ''], ['naszej', 'nasz', ''], ['ofercie', 'oferta', ''], ['jest', 'być', ''], ['ich', 'on', ''], ['kilka', 'kilka', '']]\n",
"[['interesuje', 'interesować', ''], ['mnie', 'ja', ''], ['karta', 'karta', ''], ['kredytowa', 'kredytowy', ''], [',', ',', ''], ['ktora', 'który', ''], ['bede', 'bebe', ''], ['mogl', 'móc', ''], ['uzywac', 'używać', ''], ['do', 'do', ''], ['podrozy', 'podroby', ''], ['sluzbowych', 'służbowy', '']]\n",
"[['pytanie', 'pytanie', ''], ['czy', 'czy', ''], ['trzeba', 'trzeba', ''], ['miec', 'mieć', ''], ['rachunek', 'rachunek', ''], ['firmowy', 'firmowy', ''], ['wtedy', 'wtedy', ''], ['u', 'u', ''], ['Państwa', 'państwo', ''], ['?', '?', '']]\n",
"[['Czy', 'czy', ''], ['pana', 'pan', ''], ['interesuje', 'interesować', ''], ['karta', 'karta', ''], ['firmowa', 'firmowy', ''], ['czy', 'czy', ''], ['osobista', 'osobisty', ''], ['kredytowa', 'kredytowy', ''], ['?', '?', '']]\n",
"[['No', 'no', ''], ['wlasnie', 'wlać', ''], ['tego', 'to', ''], ['do', 'do', ''], ['konca', 'koniec', ''], ['nie', 'nie', ''], ['wiem', 'wiedzieć', ''], [',', ',', ''], ['ale', 'ale', ''], ['chyba', 'chyba', ''], ['infywidualna', 'indywidualny', '']]\n",
"[['bo', 'bo', ''], ['rozumiem', 'rozumieć', ''], [',', ',', ''], ['ze', 'z', ''], ['nie', 'nie', ''], ['potrzebuje', 'potrzebować', ''], ['posiadac', 'posiadać', ''], ['rachunku', 'rachunek', ''], ['firmowego', 'firmowy', ''], ['wtedy', 'wtedy', '']]\n",
"[['Nie', 'nie', ''], ['jest', 'być', ''], ['konieczne', 'konieczny', ''], ['posiadanie', 'posiadać', ''], ['rachunku', 'rachunek', ''], ['w', 'w', ''], ['mBanku', 'mBank', ''], [',', ',', ''], ['aby', 'aby', ''], ['o', 'o', ''], ['nią', 'on', ''], ['wnioskować', 'wnioskować', ''], ['w', 'w', ''], ['żadnym', 'żaden', ''], ['z', 'z', ''], ['tych', 'ten', ''], ['przypadków', 'przypadek', ''], ['.', '.', '']]\n",
"[['Pytam', 'pytać', ''], [',', ',', ''], ['ponieważ', 'ponieważ', ''], ['mogę', 'móc', ''], ['Panu', 'pan', ''], ['zaproponować', 'zaproponować', ''], ['ciekawą', 'ciekawy', ''], ['ofertę', 'oferta', ''], [',', ',', ''], ['dla', 'dla', ''], ['kart', 'karty', ''], ['indywidualnych', 'indywidualny', ''], ['kredytowych', 'kredytowy', ''], ['.', '.', ''], ['Jaki', 'jak', ''], ['limit', 'limit', ''], ['Pana', 'pan', ''], ['interesuję', 'interesować', ''], ['?', '?', '']]\n",
"[['dobrze', 'dobrze', ''], [',', ',', ''], ['no', 'no', ''], ['mysle', 'mydło', ''], ['ze', 'z', ''], ['20', '20', \"<number length='2'/>\"], ['000', '000', \"<number length='3'/>\"], ['PLN', 'LN', '']]\n",
"[['Bylby', 'ryba', ''], ['ok', 'około', '']]\n",
"[['Zatem', 'zatem', ''], ['proponuję', 'proponować', ''], ['Panu', 'pan', ''], ['kartę', 'karta', ''], ['MasterCard', 'MasterCard', ''], ['Me', 'mój', ''], ['.', '.', ''], ['Jest', 'być', ''], ['to', 'to', ''], ['nazwa', 'nazwa', ''], ['karty', 'karty', ''], ['kredytowej', 'kredytowy', ''], ['.', '.', ''], ['Za', 'za', ''], ['pomocą', 'pomoc', ''], ['ponizszego', 'poniższy', ''], ['linku', 'link', ''], [',', ',', ''], ['przystępując', 'przystępować', ''], ['do', 'do', ''], ['promocji', 'promocja', ''], ['nie', 'nie', ''], ['zapłąci', 'zapłacić', ''], ['Pan', 'pan', ''], ['za', 'za', ''], ['jej', 'jej', ''], ['przyznanie', 'przyznać', ''], ['oraz', 'oraz', ''], ['wydanie', 'wydanie', ''], ['.', '.', ''], ['Dodatkowo', 'dodatkowo', ''], ['moze', 'może', ''], ['Pan', 'pan', ''], ['wybrać', 'wybrać', ''], ['własną', 'własny', ''], ['grafikę', 'grafika', ''], ['.', '.', '']]\n",
"[['Przesyłam', 'przesyłać', ''], ['link', 'link', ''], ['do', 'do', ''], ['wniosku', 'wniosek', ''], ['Można', 'można', ''], ['go', 'go', ''], ['zapisać', 'zapisać', ''], [',', ',', ''], ['będzie', 'być', ''], ['on', 'on', ''], ['również', 'również', ''], ['aktywny', 'aktywny', ''], ['po', 'po', ''], ['zakończeniu', 'zakończenie', ''], ['rozmowy', 'rozmowy', ''], ['–', '–', ''], ['link', 'link', ''], ['Karta', 'karta', ''], ['kredytowa', 'kredytowy', '']]\n",
"[['Czyli', 'czyli', ''], ['nie', 'nie', ''], ['bedac', 'badanie', ''], ['nominalnie', 'nominalnie', ''], ['klientem', 'klient', ''], ['banku', 'bank', ''], ['moge', 'może', ''], ['miec', 'mieć', ''], ['karte', 'wart', ''], ['?', '?', '']]\n",
"[['Czy', 'czy', ''], ['zakladamy', 'zakładać', ''], ['mi', 'mi', ''], ['konto', 'konto', ''], ['i', 'i', ''], ['dostęp', 'dostęp', ''], ['do', 'do', ''], ['Bankowości', 'bankowość', ''], ['Internetowej', 'internetowy', ''], ['?', '?', '']]\n",
"[['Tak', 'tak', ''], [',', ',', ''], ['oczywiście', 'oczywiście', ''], ['.', '.', ''], ['Będzie', 'być', ''], ['Pan', 'pan', ''], ['posiadał', 'posiadać', ''], ['kartę', 'karta', ''], ['i', 'i', ''], ['dostęp', 'dostęp', ''], ['do', 'do', ''], ['niej', 'on', ''], ['przez', 'przez', ''], ['internet', 'internet', ''], ['.', '.', '']]\n",
"[['Czyli', 'czyli', ''], ['musze', 'musza', ''], ['wypełnić', 'wypełnić', ''], ['ten', 'ten', ''], ['formularz', 'formularz', ''], ['i', 'i', ''], ['to', 'to', ''], ['wszystko', 'wszystko', ''], ['?', '?', '']]\n",
"[['Tak', 'tak', ''], [',', ',', ''], ['zgadza', 'zgadzać', ''], ['się', 'się', ''], ['.', '.', '']]\n",
"[['Proszę', 'prosić', ''], ['go', 'go', ''], ['wypełnić', 'wypełnić', ''], ['teraz-sprawdzę', 'teraz-sprawdzę', \"<identifer length='14'/>\"], ['jego', 'on', ''], ['poprawność', 'poprawność', ''], ['dla', 'dla', ''], ['Pana', 'pan', ''], ['.', '.', '']]\n",
"[['Raczej', 'raczej', ''], ['zrobie', 'zrób', ''], ['to', 'to', ''], ['w', 'w', ''], ['domu', 'dom', ''], ['wieczorem', 'wieczór', ''], [',', ',', ''], ['ale', 'ale', ''], ['wiem', 'wiedzieć', ''], ['jak', 'jak', ''], ['to', 'to', ''], ['dziala', 'działać', ''], [',', ',', ''], ['czyli', 'czyli', ''], ['zadzwonie', 'zadzwonić', ''], ['na', 'na', ''], ['eksperta', 'ekspert', ''], ['(', '(', ''], ['chatujac', 'chatować', ''], [')', ')', ''], ['i', 'i', ''], ['przesle', 'przesłać', ''], ['ten', 'ten', ''], ['formularz', 'formularz', '']]\n",
"[['bardzo', 'bardzo', ''], ['dziękuje', 'dziękować', ''], ['za', 'za', ''], ['pomoc', 'pomoc', '']]\n",
"[['aha', 'aha', ''], [',', ',', ''], ['a', 'a', ''], ['czy', 'czy', ''], ['ta', 'ten', ''], ['karta', 'karta', ''], ['ma', 'mieć', ''], ['jakis', 'jaki', ''], ['termin', 'termin', ''], ['splaty', 'spłata', ''], ['?', '?', '']]\n",
"[['Czas', 'czas', ''], ['bezodsetkowy', 'bezodsetkowy', ''], ['to', 'to', ''], ['54', '54', \"<number length='2'/>\"], ['dni', 'dzień', ''], ['.', '.', ''], ['Proszę', 'prosić', ''], ['zapisać', 'zapisać', ''], ['link', 'link', ''], ['i', 'i', ''], ['z', 'z', ''], ['niego', 'on', ''], ['skorzystać', 'skorzystać', ''], [',', ',', ''], ['aby', 'aby', ''], ['miał', 'miał', ''], ['Pan', 'pan', ''], ['zagwarantowane', 'zagwarantować', ''], ['opisane', 'opisać', ''], ['promocyjne', 'promocyjny', ''], ['warunki', 'warunki', ''], ['.', '.', '']]\n",
"[['Dziekuje', 'dziękować', ''], ['pięknie', 'pięknie', ''], ['za', 'za', ''], ['rozmowe', 'rozmowy', '']]\n",
"[['Ciesze', 'ciesać', ''], ['się', 'się', ''], [',', ',', ''], ['ze', 'z', ''], ['mogłam', 'móc', ''], ['pomóc', 'pomóc', '']]\n",
"[['Do', 'do', ''], ['uslyszenia', 'usłyszeć', '']]\n"
]
}
],
"source": [
"for i in range(1):\n",
" print(f'---------- {i+1} ----------')\n",
" with open(f'/net/people/plgapohl/notebooks/data/ailleron-{i+1}.json') as f:\n",
" for line in f:\n",
" tuples = convert(json.loads(line)['content'], stemmer, parser, dictionary)\n",
" print(tuples)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ła'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"Łaaa\"[:2].lower()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment