- 实词:名词、动词、形容词、状态词、区别词、数词、量词、代词
- 虚词:副词、介词、连词、助词、拟声词、叹词。
n 名词
nr 人名
import numpy as np | |
import marisa_trie | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.externals import six | |
class MarisaCountVectorizer(CountVectorizer): | |
# ``CountVectorizer.fit`` method calls ``fit_transform`` so | |
# ``fit`` is not provided | |
def fit_transform(self, raw_documents, y=None): |
# Your init script | |
# | |
# Atom will evaluate this file each time a new window is opened. It is run | |
# after packages are loaded/activated and after the previous editor state | |
# has been restored. | |
# | |
# An example hack to log to the console when each text editor is saved. | |
# | |
# atom.workspace.observeTextEditors (editor) -> | |
# editor.onDidSave -> |
import tensorflow as tf | |
import numpy as np | |
if __name__ == '__main__': | |
np.random.seed(1) | |
# the size of the hidden state for the lstm (notice the lstm uses 2x of this amount so actually lstm will have state of size 2) | |
size = 1 | |
# 2 different sequences total | |
batch_size= 2 | |
# the maximum steps for both sequences is 10 |
import pandas as pd | |
import numpy as np | |
from sklearn.feature_extraction import DictVectorizer | |
def encode_onehot(df, cols): | |
""" | |
One-hot encoding is applied to columns specified in a pandas DataFrame. | |
Modified from: https://gist.github.com/kljensen/5452382 | |
from pyspark import SparkContext | |
import numpy as np | |
from sklearn.cross_validation import train_test_split, Bootstrap | |
from sklearn.datasets import make_classification | |
from sklearn.metrics import accuracy_score | |
from sklearn.tree import DecisionTreeClassifier | |
def run(sc): |
# Automatically install pptpd on Amazon EC2 Amazon Linux
# | |
# Ripped from http://blog.diahosting.com/linux-tutorial/pptpd/ | |
# pptpd source rpm packaged by its authors
# | |
# WARNING: | |
# The first ms-dns setting is 172.16.0.23 because that is the address shown in
# my /etc/resolv.conf; I'm not sure it is the same in all Amazon AWS zones.
# | |
# You also need to adjust the "Security Groups" you are using.
import multiprocessing | |
import pandas as pd | |
import numpy as np | |
def _apply_df(args): | |
df, func, kwargs = args | |
return df.apply(func, **kwargs) | |
def apply_by_multiprocessing(df, func, **kwargs): | |
workers = kwargs.pop('workers') |
# List the unique values in a DataFrame column
pd.unique(df.column_name.ravel())

# Convert a Series to a numeric dtype; values that cannot be parsed as
# numbers become NaN. Series.convert_objects() has been removed from pandas,
# so pd.to_numeric(..., errors='coerce') is used instead.
df['col'] = pd.to_numeric(df['col'].astype(str), errors='coerce')

# Keep only the DataFrame rows whose column holds one of the listed values.
# The list name must match the one passed to isin(); the original defined
# `valuelist` but referenced `value_list`, which raised NameError.
value_list = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]