Query -2 Spark project Problem 4
Please find code and error.
val logFile = sc.textFile("/data/spark/project/NASA_access_log_Aug95.gz")
def containsHTTP(line:String):Boolean = {
val pattern = “”"(\d{3})""".r
val res = pattern.findFirstMatchIn(line)
if (res.isEmpty)
{
return false
}
else
{
return true
}
}
var urlaccesslogs = logFile.filter(containsHTTP)
// below fuction gives only HTTP //
def extractHTTP(line:String):(String) = {
var arr = line.split(" ");
arr(8)
}// the above function is working fine … gives only HTTP
var HTTPval = urlaccesslogs.map(line=>(extractHTTP(line),1))
var HTTPcnts = HTTPval.reduceByKey((a,b) => (a+b))
var HTTPcountsOrdered = HTTPcounts.sortBy(f => f._2, false);
HTTPcountsOrdered.take(5).foreach(println)
Please find error as below.
17/09/19 10:59:38 ERROR scheduler.TaskSetManager: Task 0 in stage 53.0 failed 1 times; aborting joborg.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 53.0 failed 1 times, most recent failure: Lost task 0.0 in stage 53.0 (TID 37, localhost): java.lang.ArrayIndexOutOfBoundsException