Skip to content

Instantly share code, notes, and snippets.

@prio101
Last active September 14, 2024 13:54
Show Gist options
  • Save prio101/a2c631f80d56571f854a11e48d7d7b46 to your computer and use it in GitHub Desktop.
Save prio101/a2c631f80d56571f854a11e48d7d7b46 to your computer and use it in GitHub Desktop.
Splits a mixed string into its Bangla (bn) and English (en) parts; only these two languages are distinguished.
from collections import defaultdict

from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
def split_english_bangla(mixed_string):
    """Split a mixed English/Bangla string into per-language text.

    The input is split on whitespace and every word is classified on its
    own with ``detect_langs``.  A word whose top-ranked language is ``'bn'``
    goes into the ``'bn'`` bucket; a word detected as any other language is
    folded into ``'en'``; a word the detector cannot classify goes into
    ``'unknown'``.

    Args:
        mixed_string: Text to split.

    Returns:
        A dict mapping each bucket that received at least one word
        (``'bn'``, ``'en'`` and/or ``'unknown'``) to those words joined by
        single spaces, in their original order.  Keys appear in order of
        first occurrence.  Input containing no words yields ``{}``.
    """
    # Collect words per bucket and join once at the end rather than
    # growing strings by repeated concatenation.
    words_by_bucket = defaultdict(list)

    for word in mixed_string.split():
        try:
            # The first candidate returned is the most confident language.
            top_lang = detect_langs(word)[0].lang
        except (LangDetectException, IndexError):
            # Detection failed (presumably a token with nothing to analyse,
            # such as bare digits or punctuation) or no candidate came
            # back.  Only these failures are treated as 'unknown'; anything
            # else is a real error and is allowed to propagate.
            bucket = 'unknown'
        else:
            # Only Bangla is kept apart; every other language counts as 'en'.
            bucket = 'bn' if top_lang == 'bn' else 'en'
        words_by_bucket[bucket].append(word)

    return {bucket: " ".join(words) for bucket, words in words_by_bucket.items()}
# Demo input: English and Bangla sentences interleaved in one string.
mixed_string = "Hello world! এই পৃথিবী সুন্দর। How are you? তুমি কেমন আছো?"
# Run the splitter on the demo input; result maps each language key to its text.
result = split_english_bangla(mixed_string)
# Show the mapping returned for the demo input.
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment