from pyspark.sql.types import StructType, StructField, StringType
# Sample data: two rows, each a pair of strings matching the two schema columns.
rdd = sc.parallelize(
    [
        ("moo this has stopwords b", "bat this one does not"),
        ("apple orange banana", "cookie jar bla la"),
    ]
)

# Both columns are nullable strings, so the fields are built from their names.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ('entity', 'cleaned_entity')
    ]
)

# Turn the RDD of tuples into a DataFrame using the explicit schema.
df3 = sqlContext.createDataFrame(rdd, schema)
Last active February 27, 2020 12:13
-
-
Save caseyliqb/a04b80f4dfda569a6c957ee8f29dc997 to your computer and use it in GitHub Desktop.
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.
output: