How are null values handled in Spark 2.0 when creating a DataFrame? Is there a way to use Option? [duplicate]
This question already has an answer here:
Problems to create DataFrame from Rows containing Option[T] (2 answers)
Unable to create org.apache.spark.sql.Row with scala.None value since Spark 2.X (1 answer)
I'm trying to create a DataFrame from an RDD in Apache Spark 2.0:
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

val spark = SparkSession.builder().getOrCreate()
val sc = spark.sparkContext

val fieldList = List(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("number", LongType, nullable = true)
)
val schema = StructType(fieldList)
// Option-wrapped values: this is what Spark 2.0 rejects at runtime.
val rdd = sc.parallelize(List(
  Row(Some("Jesus"), 22, 123456789L),
  Row(Some("Peter"), 30, 45678912L),
  Row(None, 18, 127834783L)
))
val df = spark.createDataFrame(rdd, schema)
df.write.parquet("/Users/dummy")
The exception thrown is the following:
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:279)
at org.apache.spark.sql.SparkSession$$anonfun$5.apply(SparkSession.scala:537)
at org.apache.spark.sql.SparkSession$$anonfun$5.apply(SparkSession.scala:537)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:363)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:363)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:247)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: scala.Some is not a valid external type for schema of string
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:276)
The problem here is that the DataFrame does not accept values of class Option. I don't understand why, since in Spark 1.6 I was able to create a DataFrame using Option values. I'm forcing the use of Option here because I don't want to use null values.
Is there any way a DataFrame can accept Option values in Spark 2.0?
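For reference, a minimal sketch of two common workarounds, built on the sample data above (the Person case class is an illustrative name, not from the question). The first unwraps each Option with .orNull before constructing the Row; the second keeps Option in a case class and lets the Dataset encoder turn None into a SQL NULL.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

// Workaround 1: Row fields must hold the plain external type or null,
// so unwrap each Option with .orNull before building the Row.
val schema = StructType(List(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("number", LongType, nullable = true)
))
val rows = List(
  (Option("Jesus"), 22, 123456789L),
  (Option("Peter"), 30, 45678912L),
  (None: Option[String], 18, 127834783L)
).map { case (name, age, number) => Row(name.orNull, age, number) }
val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)

// Workaround 2: a case class with an Option field. Encoders understand
// Option, so None is written as NULL in a nullable column.
case class Person(name: Option[String], age: Int, number: Long)
val ds = spark.createDataset(Seq(
  Person(Some("Jesus"), 22, 123456789L),
  Person(None, 18, 127834783L)
))
ds.toDF().printSchema()  // name: string (nullable = true)

Both approaches keep Option in application code and only drop it at the Spark boundary.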
scala apache-spark apache-spark-sql
marked as duplicate by user6910411
Nov 13 '18 at 16:10
This question has been asked before and already has an answer. If those answers do not fully address your question, please ask a new question.
asked Nov 13 '18 at 16:07
Jesus Vasquez
235