# Source: GitHub gist mindey/da4318e86217e5b89d226d2a294faf4c
# Author: @mindey
# Created: October 21, 2017 12:00
import os
import collections
import langdetect
# Names of the entries found in langdetect's profiles directory.
# NOTE(review): presumably one profile file per supported language code,
# making this the list of detectable languages -- confirm against langdetect.
LANGUAGE_CODES = os.listdir(langdetect.PROFILES_DIRECTORY)
def detect_language(text, max_length=2):
    """Detect the language of `text` and return a short language key.

    The code reported by `langdetect.detect` is returned as-is when it is
    at most `max_length` characters long.  Longer codes are first looked up
    in a small alias table ('zh-cn' -> 'cn', 'zh-tw' -> 'zh'); codes without
    an alias are kept unchanged.  In every case the result is finally
    truncated to `max_length` characters.

    Args:
        text: The text whose language should be detected.
        max_length: Maximum number of characters in the returned key.

    Returns:
        A string of at most `max_length` characters.

    Raises:
        Whatever `langdetect.detect` raises for `text` is propagated.
    """
    # Aliases for the region-qualified codes that do not fit in two letters.
    shorter = {'zh-cn': 'cn', 'zh-tw': 'zh'}
    code = langdetect.detect(text)
    if len(code) > max_length:
        # Fall back to the code itself when there is no alias; a bare
        # `.get(code)` would yield None and make the slice below fail.
        code = shorter.get(code, code)
    return code[:max_length]
def _split_key(token, ends, min_key_length, max_key_length):
    """Separate a leading key from `token`; return `(key, text)` or `(None, token)`."""
    head = token[:max_key_length + 1]
    if len(head) <= max_key_length:
        # Token is too short to hold a key followed by an end symbol.
        return None, token
    candidates = [
        pos for pos in (head.find(symbol) for symbol in ends)
        if min_key_length <= pos <= max_key_length
    ]
    if not candidates:
        return None, token
    # The earliest end symbol terminates the key, so a key can never
    # contain another end symbol regardless of the order of `ends`.
    pos = min(candidates)
    return token[:pos], token[pos + 1:]


def split(text, sep='.:', ends=('\n', ':'), min_key_length=2, max_key_length=2,
          autodetect=True, pargraph_sep='\n\n', markdown=False, title=False):
    """Split multilingual `text` into per-language texts.

    `text` is cut at every `sep`.  A piece that starts with a key of
    `min_key_length` to `max_key_length` characters followed by one of the
    `ends` symbols is filed under that key; pieces sharing a key are
    concatenated in order of appearance.

    Pieces without a key are handled according to `autodetect`:
    when true, the piece is cut at `pargraph_sep` and every non-empty
    paragraph is filed under the key returned by `detect_language`
    (the separator is re-appended after every paragraph but the last);
    when false, the whole piece is filed under the key found for it,
    which is normally `None`.

    Args:
        text: The text to split.
        sep: Marker that introduces a new keyed piece.
        ends: Symbols that may terminate a key.  `ends[0]` is used when
            rendering markdown bodies, `ends[1]` when rendering titles.
        min_key_length: Minimum accepted key length.
        max_key_length: Maximum accepted key length.
        autodetect: Detect the language of pieces that carry no key.
        pargraph_sep: Paragraph separator used during autodetection.
        markdown: Return the result re-assembled as one marked-up string.
        title: With `markdown`, terminate keys with `ends[1]` instead of
            `ends[0]`.

    Returns:
        An `collections.OrderedDict` mapping keys to texts, ordered by
        first appearance, or -- when `markdown` is true -- a single string
        of the form `{sep}{key}{end}{text}...` with surrounding whitespace
        stripped.
    """
    # Insertion order of the OrderedDict doubles as the language sequence.
    result = collections.OrderedDict()

    def add(key, piece):
        result[key] = result.get(key, '') + piece

    for token in text.split(sep):
        if not token:
            continue
        name, chunk = _split_key(token, ends, min_key_length, max_key_length)
        if name or not autodetect:
            add(name, chunk)
            continue
        paragraphs = chunk.split(pargraph_sep)
        last = len(paragraphs) - 1
        for i, paragraph in enumerate(paragraphs):
            if not paragraph:
                continue
            lang = detect_language(paragraph)
            # Restore the separator consumed by the split, except at the end.
            add(lang, (paragraph + pargraph_sep) if i < last else paragraph)

    if not markdown:
        return result

    end_index = 1 if title else 0
    return ''.join(
        '{sep}{lang}{end}{text}'.format(
            sep=sep,
            lang=lang,
            end=ends[end_index],
            text=body,
        )
        for lang, body in result.items()
    ).strip()
def test_title():
    """Inline `key:` markers on one line yield ordered per-language texts."""
    source = '.:en:hello world.:lt:smart world.:ja:今日は、世界'
    wanted = collections.OrderedDict()
    wanted['en'] = 'hello world'
    wanted['lt'] = 'smart world'
    wanted['ja'] = '今日は、世界'
    assert split(source) == wanted
def test_body():
    """Keyed multi-line sections are merged per key, keeping blank lines."""
    # The fixture is spelled with explicit escapes so the blank lines that
    # the expectation relies on cannot be lost to whitespace mangling.
    text = (
        '.:en\n'
        'some text\n\n'
        'which is good\n\n'
        '.:ru\n'
        'несколько текста\n\n'
        '.:en\n'
        'so want to try\n\n'
        '.:lt\n'
        'nieko sau, viskas gerai\n\n'
        '.:cn\n'
        '中文也可以的\n'
    )
    expect = collections.OrderedDict(
        [('en', 'some text\n\nwhich is good\n\nso want to try\n\n'),
         ('ru', 'несколько текста\n\n'),
         ('lt', 'nieko sau, viskas gerai\n\n'),
         ('cn', '中文也可以的\n')]
    )
    assert split(text) == expect
def test_partial_autodetect():
    """A leading piece without a key is autodetected; keyed pieces are kept."""
    source = 'hello world.:lt:smart world.:ja:今日は、世界'
    wanted = collections.OrderedDict()
    wanted['en'] = 'hello world'
    wanted['lt'] = 'smart world'
    wanted['ja'] = '今日は、世界'
    outcome = split(source)
    assert outcome == wanted
def test_autodetect():
    """Text without any keys is grouped by the detected language of each paragraph."""
    # Paragraphs must be separated by a blank line ('\n\n'): that is the
    # unit `split` hands to language detection.  Explicit escapes keep the
    # separators from being lost to whitespace mangling.
    text = (
        'some text\nwhich is good\n\n'
        'несколько текста\n\n'
        'so want to try\n\n'
        'šienpjovys džemas\n\n'
        '中文也可以的\n'
    )
    expect = collections.OrderedDict(
        [('en', 'some text\nwhich is good\n\nso want to try\n\n'),
         ('ru', 'несколько текста\n\n'),
         ('lt', 'šienpjovys džemas\n\n'),
         ('cn', '中文也可以的\n')]
    )
    result = split(text)
    assert result == expect
def test_markdown():
    """Autodetected paragraphs are re-assembled into keyed markdown sections."""
    # Paragraphs are separated by a blank line ('\n\n'), the unit `split`
    # uses for language detection.  Explicit escapes keep the separators
    # from being lost to whitespace mangling.
    text = (
        '中文也可以的\n\n'
        'some text\nwhich is good\n\n'
        'несколько текста\n\n'
        'so want to try\n\n'
        'šienpjovys džemas'
    )
    # Each section is '.:<key>\n<text>'; every paragraph except the very
    # last one keeps the separator that followed it in the input.
    expect = (
        '.:cn\n'
        '中文也可以的\n\n'
        '.:en\n'
        'some text\nwhich is good\n\n'
        'so want to try\n\n'
        '.:ru\n'
        'несколько текста\n\n'
        '.:lt\n'
        'šienpjovys džemas'
    )
    result = split(text, markdown=True)
    assert result == expect
def test_markdown_title():
    """Title mode renders every section as `.:key:text` on a single line."""
    source = '世界,你好.:lt:Sveikas, Pasauli'
    wanted = '.:cn:世界,你好.:lt:Sveikas, Pasauli'
    outcome = split(source, markdown=True, title=True)
    assert outcome == wanted
if __name__ == '__main__':
    # Run every self-check, in order, when the module is executed directly.
    for check in (
        test_title,
        test_body,
        test_partial_autodetect,
        test_autodetect,
        test_markdown,
        test_markdown_title,
    ):
        check()
# End of gist (GitHub page footer omitted).