Skip to content

Instantly share code, notes, and snippets.

@Puzer
Last active October 9, 2019 17:14
Show Gist options
  • Save Puzer/432e224a56f49ae0c52d91ea4b919e6b to your computer and use it in GitHub Desktop.
Save Puzer/432e224a56f49ae0c52d91ea4b919e6b to your computer and use it in GitHub Desktop.
# Module-level BPEmb model for Russian (lang='ru') with 50-dimensional
# vectors (dim=50); embed_text() reads it as a global.
bpemb_ru = BPEmb(lang='ru', dim=50)
def extract_text(json_data):
    """Return the Russian-looking fragments of a raw JSON string, space-joined.

    The input is not parsed as JSON. It is split on every double-quote
    character, and each resulting fragment is kept if it contains at least
    one Russian Cyrillic letter. Consequences of this heuristic:

    * object keys that contain Cyrillic letters are kept just like values;
    * a value containing an escaped quote is split at that quote.

    Args:
        json_data: The raw JSON document as a ``str``.

    Returns:
        The kept fragments in their original order, joined by single
        spaces; an empty string when no fragment qualifies.
    """
    def _has_russian_letter(fragment):
        # U+0410..U+044F is the contiguous run of Russian capital and small
        # letters. The letters U+0401 and U+0451 (capital and small "yo")
        # live outside that run and must be tested for explicitly.
        return any(
            "\u0410" <= ch <= "\u044f" or ch in "\u0401\u0451"
            for ch in fragment
        )

    return " ".join(
        fragment
        for fragment in json_data.split('"')
        if _has_russian_letter(fragment)
    )
def embed_text(text):
    """Embed ``text`` as the sum of its BPE subword vectors.

    The text is encoded to subword ids with the module-level ``bpemb_ru``
    model; the rows of ``bpemb_ru.emb.vectors`` selected by those ids are
    then summed along axis 0.

    Args:
        text: The string to embed.

    Returns:
        The axis-0 sum of the selected embedding rows.
    """
    subword_ids = bpemb_ru.encode_ids(text)
    # NOTE: an optional filter of the ids against a whitelist was left
    # disabled in the original implementation; every id is used.
    subword_vectors = bpemb_ru.emb.vectors[subword_ids]
    return subword_vectors.sum(axis=0)
def preprocess_stories_description(df):
    """Build a frame of summed BPE embeddings, one row per story.

    Each ``story_json`` value is reduced to its Russian text with
    ``extract_text`` and embedded with ``embed_text``; the resulting vectors
    are stacked row-wise into a new DataFrame.

    Args:
        df: DataFrame with a ``story_json`` column and a ``story_id`` column.

    Returns:
        A DataFrame on a fresh 0..n-1 index, rows in the same order as
        ``df``, with one ``story_bpe__<i>`` column per embedding dimension
        plus a ``story_id`` column.

    Raises:
        ValueError: If ``df`` has no rows (``np.vstack`` cannot stack an
            empty sequence).
    """
    embeddings = df.story_json.map(extract_text).map(embed_text)
    story_emb_df = pd.DataFrame(
        np.vstack(embeddings.tolist())
    ).add_prefix('story_bpe__')
    # Assign by position, not by index label. The stacked frame has a fresh
    # RangeIndex, so assigning the Series directly would align on labels and
    # yield NaN or mismatched ids whenever ``df`` has a non-default index
    # (for example after filtering or sorting).
    story_emb_df['story_id'] = df['story_id'].to_numpy()
    return story_emb_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment