spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Andy Davidson <A...@SantaCruzIntegration.com>
Subject pyspark unable to create UDF: java.lang.RuntimeException: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a directory: /tmp tmp
Date Thu, 18 Aug 2016 21:56:16 GMT
For some unknown reason I cannot create a UDF when I run the attached notebook
on my cluster. I get the following error:

Py4JJavaError: An error occurred while calling
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException:
org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a
directory: /tmp tmp

The notebook runs fine on my Mac

In general I am able to run non-UDF Spark code without any trouble.

I start the notebook server as the user "ec2-user" and use the master URL
spark://ec2-51-215-120-63.us-west-1.compute.amazonaws.com:6066


I found the following messages in the notebook server log file. I have the log
level set to WARN:

16/08/18 21:38:45 WARN ObjectStore: Version information not found in
metastore. hive.metastore.schema.verification is not enabled so recording
the schema version 1.2.0

16/08/18 21:38:45 WARN ObjectStore: Failed to get database default,
returning NoSuchObjectException



The cluster was originally created using
spark-1.6.1-bin-hadoop2.6/ec2/spark-ec2


#from pyspark.sql import SQLContext, HiveContext
#sqlContext = SQLContext(sc)
​
#from pyspark.sql import DataFrame
#from pyspark.sql import functions
​
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
​
print("spark version: {}".format(sc.version))
​
import sys
print("python version: {}".format(sys.version))
spark version: 1.6.1
python version: 3.4.3 (default, Apr  1 2015, 18:10:40)
[GCC 4.8.2 20140120 (Red Hat 4.8.2-16)]



# functions.lower() raises
# py4j.Py4JException: Method lower([class java.lang.String]) does not exist
# work around define a UDF
toLowerUDFRetType = StringType()
#toLowerUDF = udf(lambda s : s.lower(), toLowerUDFRetType)
toLowerUDF = udf(lambda s : s.lower(), StringType())
You must build Spark with Hive. Export 'SPARK_HIVE=true' and run build/sbt
assembly
Py4JJavaErrorTraceback (most recent call last)
<ipython-input-2-2e0f7c0bb4f9> in <module>()
      4 toLowerUDFRetType = StringType()
      5 #toLowerUDF = udf(lambda s : s.lower(), toLowerUDFRetType)
----> 6 toLowerUDF = udf(lambda s : s.lower(), StringType())

/root/spark/python/pyspark/sql/functions.py in udf(f, returnType)
   1595     [Row(slen=5), Row(slen=3)]
   1596     """
-> 1597     return UserDefinedFunction(f, returnType)
   1598 
   1599 blacklist = ['map', 'since', 'ignore_unicode_prefix']

/root/spark/python/pyspark/sql/functions.py in __init__(self, func,
returnType, name)
   1556         self.returnType = returnType
   1557         self._broadcast = None
-> 1558         self._judf = self._create_judf(name)
   1559 
   1560     def _create_judf(self, name):

/root/spark/python/pyspark/sql/functions.py in _create_judf(self, name)
   1567         pickled_command, broadcast_vars, env, includes =
_prepare_for_python_RDD(sc, command, self)
   1568         ctx = SQLContext.getOrCreate(sc)
-> 1569         jdt = ctx._ssql_ctx.parseDataType(self.returnType.json())
   1570         if name is None:
   1571             name = f.__name__ if hasattr(f, '__name__') else
f.__class__.__name__

/root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
    681         try:
    682             if not hasattr(self, '_scala_HiveContext'):
--> 683                 self._scala_HiveContext = self._get_hive_ctx()
    684             return self._scala_HiveContext
    685         except Py4JError as e:

/root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
    690 
    691     def _get_hive_ctx(self):
--> 692         return self._jvm.HiveContext(self._jsc.sc())
    693 
    694     def refreshTable(self, tableName):

/root/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in
__call__(self, *args)
   1062         answer = self._gateway_client.send_command(command)
   1063         return_value = get_return_value(
-> 1064             answer, self._gateway_client, None, self._fqn)
   1065 
   1066         for temp_arg in temp_args:

/root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     43     def deco(*a, **kw):
     44         try:
---> 45             return f(*a, **kw)
     46         except py4j.protocol.Py4JJavaError as e:
     47             s = e.java_exception.toString()

/root/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name)
    306                 raise Py4JJavaError(
    307                     "An error occurred while calling {0}{1}{2}.\n".
--> 308                     format(target_id, ".", name), value)
    309             else:
    310                 raise Py4JError(

Py4JJavaError: An error occurred while calling
None.org.apache.spark.sql.hive.HiveContext.
: java.lang.RuntimeException:
org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is not a
directory: /tmp tmp
	at 
org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1
489)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys
tem.java:2979)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j
ava:2932)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java
:2911)
	at 
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS
erver.java:649)
	at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator
PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
	at 
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam
enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
	at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto
bufRpcEngine.java:453)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja
va:1408)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

	at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
	at 
org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:20
4)
	at 
org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedC
lientLoader.scala:238)
	at 
org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.s
cala:218)
	at 
org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:208)
	at 
org.apache.spark.sql.hive.HiveContext.functionRegistry$lzycompute(HiveContex
t.scala:462)
	at 
org.apache.spark.sql.hive.HiveContext.functionRegistry(HiveContext.scala:461
)
	at org.apache.spark.sql.UDFRegistration.<init>(UDFRegistration.scala:40)
	at org.apache.spark.sql.SQLContext.<init>(SQLContext.scala:330)
	at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:90)
	at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAcces
sorImpl.java:62)
	at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstruc
torAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:214)
	at 
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:7
9)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: Parent path is
not a directory: /tmp tmp
	at 
org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1
489)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys
tem.java:2979)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j
ava:2932)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java
:2911)
	at 
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS
erver.java:649)
	at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator
PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
	at 
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam
enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
	at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto
bufRpcEngine.java:453)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja
va:1408)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at 
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAcces
sorImpl.java:62)
	at 
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstruc
torAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at 
org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.j
ava:90)
	at 
org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.
java:57)
	at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2110)
	at org.apache.hadoop.hdfs.DFSClient.mkdirs(DFSClient.java:2079)
	at 
org.apache.hadoop.hdfs.DistributedFileSystem.mkdirs(DistributedFileSystem.ja
va:543)
	at 
org.apache.hadoop.hive.ql.exec.Utilities.createDirsWithPermission(Utilities.
java:3679)
	at 
org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionStat
e.java:597)
	at 
org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionStat
e.java:554)
	at 
org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
	... 21 more
Caused by: 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExists
Exception): Parent path is not a directory: /tmp tmp
	at 
org.apache.hadoop.hdfs.server.namenode.FSDirectory.mkdirs(FSDirectory.java:1
489)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInternal(FSNamesys
tem.java:2979)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirsInt(FSNamesystem.j
ava:2932)
	at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.mkdirs(FSNamesystem.java
:2911)
	at 
org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.mkdirs(NameNodeRpcS
erver.java:649)
	at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslator
PB.mkdirs(ClientNamenodeProtocolServerSideTranslatorPB.java:417)
	at 
org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNam
enodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java:44096)
	at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(Proto
bufRpcEngine.java:453)
	at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1002)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1695)
	at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1691)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.ja
va:1408)
	at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1689)

	at org.apache.hadoop.ipc.Client.call(Client.java:1225)
	at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.jav
a:202)
	at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62
)
	at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl
.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at 
org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocati
onHandler.java:164)
	at 
org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHand
ler.java:83)
	at com.sun.proxy.$Proxy21.mkdirs(Unknown Source)
	at 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.mkdirs(
ClientNamenodeProtocolTranslatorPB.java:425)
	at org.apache.hadoop.hdfs.DFSClient.primitiveMkdir(DFSClient.java:2108)
	... 27 more





Mime
View raw message