Skip to content

Instantly share code, notes, and snippets.

@rxin
Created June 3, 2015 08:19
Show Gist options
  • Save rxin/577f7e15545a1edc6f88 to your computer and use it in GitHub Desktop.
Save rxin/577f7e15545a1edc6f88 to your computer and use it in GitHub Desktop.
In [1]: df = sqlContext.read.json("examples/src/main/resources/people.json")
In [2]: df.withColumn('a b', df.age)
Out[2]: DataFrame[age: bigint, name: string, a b: bigint]
In [3]: df.withColumn('a b', df.age).write.parquet('test-parquet.out')
15/06/03 01:14:56 ERROR InsertIntoHadoopFsRelation: Aborting job.
java.lang.RuntimeException: Attribute name "a b" contains invalid character(s) among " ,;{}() =". Please use alias to rename it.
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.parquet.ParquetTypesConverter$$anonfun$checkSpecialCharacters$2.apply(ParquetTypes.scala:414)
at org.apache.spark.sql.parquet.ParquetTypesConverter$$anonfun$checkSpecialCharacters$2.apply(ParquetTypes.scala:412)
at scala.collection.immutable.List.foreach(List.scala:318)
at org.apache.spark.sql.parquet.ParquetTypesConverter$.checkSpecialCharacters(ParquetTypes.scala:412)
at org.apache.spark.sql.parquet.ParquetTypesConverter$.convertToString(ParquetTypes.scala:423)
at org.apache.spark.sql.parquet.RowWriteSupport$.setSchema(ParquetTableSupport.scala:383)
at org.apache.spark.sql.parquet.ParquetRelation2.prepareJobForWrite(newParquet.scala:230)
at org.apache.spark.sql.sources.BaseWriterContainer.driverSideSetup(commands.scala:276)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.insert(commands.scala:121)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.run(commands.scala:104)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:68)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:148)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:87)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:920)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:920)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:338)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:144)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:135)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:281)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:744)
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-3-fd925f614f74> in <module>()
----> 1 df.withColumn('a b', df.age).write.parquet('test-parquet.out')
/scratch/rxin/spark/python/pyspark/sql/readwriter.pyc in parquet(self, path, mode)
350 >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))
351 """
--> 352 self._jwrite.mode(mode).parquet(path)
353
354 @since(1.4)
/Users/rxin/anaconda/lib/python2.7/site-packages/py4j-0.8.1-py2.7.egg/py4j/java_gateway.pyc in __call__(self, *args)
535 answer = self.gateway_client.send_command(command)
536 return_value = get_return_value(answer, self.gateway_client,
--> 537 self.target_id, self.name)
538
539 for temp_arg in temp_args:
/Users/rxin/anaconda/lib/python2.7/site-packages/py4j-0.8.1-py2.7.egg/py4j/protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o35.parquet.
: java.lang.NullPointerException
at org.apache.spark.sql.sources.BaseWriterContainer.abortJob(commands.scala:362)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.insert(commands.scala:127)
at org.apache.spark.sql.sources.InsertIntoHadoopFsRelation.run(commands.scala:104)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult$lzycompute(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.sideEffectResult(commands.scala:57)
at org.apache.spark.sql.execution.ExecutedCommand.doExecute(commands.scala:68)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:88)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:148)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:87)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd$lzycompute(SQLContext.scala:920)
at org.apache.spark.sql.SQLContext$QueryExecution.toRdd(SQLContext.scala:920)
at org.apache.spark.sql.sources.ResolvedDataSource$.apply(ddl.scala:338)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:144)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:135)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:281)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:744)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment