Hi, I am following as per blog I found but not able to get pyspark working in Jupyter.
import os
import sys
os.environ[“SPARK_HOME”] = “/usr/hdp/current/spark2-client”
os.environ[“PYLIB”] = os.environ[“SPARK_HOME”] + “/python/lib”
In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ[“PYSPARK_PYTHON”] = “/usr/local/anaconda/bin/python”
os.environ[“PYSPARK_DRIVER_PYTHON”] = “/usr/local/anaconda/bin/python”
sys.path.insert(0, os.environ[“PYLIB”] +"/py4j-0.10.6-src.zip")
sys.path.insert(0, os.environ[“PYLIB”] +"/pyspark.zip")
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName(“appName”)
sc = SparkContext(conf=conf)
rdd = sc.textFile("/data/mr/wordcount/input/")
print(rdd.take(10))
sc.version
Output:
ModuleNotFoundError Traceback (most recent call last)
in
9 sys.path.insert(0, os.environ[“PYLIB”] +"/pyspark.zip")
10
—> 11 from pyspark import SparkContext, SparkConf
12 conf = SparkConf().setAppName(“appName”)
13 sc = SparkContext(conf=conf)
/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/init.py in
42
43 from pyspark.conf import SparkConf
—> 44 from pyspark.context import SparkContext
45 from pyspark.rdd import RDD
46 from pyspark.files import SparkFiles
/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/context.py in
27 from tempfile import NamedTemporaryFile
28
—> 29 from py4j.protocol import Py4JError
30
31 from pyspark import accumulators
ModuleNotFoundError: No module named ‘py4j’