package com.alok.projects
import org.apache.spark.sql.SparkSession
/**
 * Spark job that lists anagram groups among the distinct words of a text file.
 *
 * Pipeline: read the file line by line, split into whitespace-separated tokens,
 * drop stop words, strip everything that is not an ASCII letter or digit,
 * lower-case, de-duplicate, then group words that share the same sorted
 * character sequence.
 */
object entry {

  /** Input read when no path is supplied as the first command-line argument. */
  private val DefaultInputPath = "src/resources/big.txt"

  /** Runs of characters that are neither ASCII letters nor digits; removed from every token. */
  private val NonAlphanumeric = "[^a-zA-Z0-9]+".r

  /** Punctuation hugging either end of an already lower-cased token (quotes, commas, ...). */
  private val EdgePunctuation = "^[^a-z0-9]+|[^a-z0-9]+$".r

  /** Lower-case English stop words that are excluded from the analysis. */
  private val StopWords: Set[String] = Set(
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
    "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
    "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he",
    "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i",
    "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
    "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
    "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
    "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was",
    "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which",
    "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
    "your", "yours", "yourself", "yourselves", "may", "no", "not", "now", "will", "must", "can")

  /** Locale-independent lower-casing, so the result does not depend on the JVM's default locale. */
  private def lower(text: String): String = text.toLowerCase(java.util.Locale.ROOT)

  /**
   * Tells whether a raw token is a stop word once surrounding punctuation is ignored.
   *
   * This check runs before [[normalize]] because normalisation deletes apostrophes:
   * afterwards "he'd" has become "hed" and could no longer match its stop-word entry.
   */
  private def isStopWordToken(rawToken: String): Boolean = {
    // Treat the typographic apostrophe like the ASCII one used in the stop-word list.
    val lowered = lower(rawToken).replace('\u2019', '\'')
    StopWords.contains(EdgePunctuation.replaceAllIn(lowered, ""))
  }

  /** Removes every non-alphanumeric character from the token, then lower-cases it. */
  private def normalize(rawToken: String): String =
    lower(NonAlphanumeric.replaceAllIn(rawToken, ""))

  /** Anagram key of a word: its characters in sorted order, one single-character string each. */
  private def signature(word: String): List[String] =
    word.toList.sorted.map(_.toString)

  /**
   * Prints ten cleaned distinct words, then ten anagram groups (key, words) with at least two members.
   *
   * @param args optional; `args(0)` is the path of the text file to analyse
   *             (defaults to `src/resources/big.txt`)
   */
  def main(args: Array[String]): Unit = {
    val inputPath = args.headOption.getOrElse(DefaultInputPath)

    val spark = SparkSession
      .builder()
      .appName("String Analysis")
      .config("spark.master", "local")
      .getOrCreate()

    try {
      val novel = spark.sparkContext.textFile(inputPath)

      val cleanedWords = novel
        // Split on any whitespace run so tab-separated words are not fused together
        // once punctuation is stripped; resulting empty tokens are filtered below.
        .flatMap(_.split("\\s+"))
        .filter(token => !isStopWordToken(token))
        .map(normalize)
        .filter(word => word.nonEmpty && !StopWords.contains(word))
        .distinct()

      cleanedWords.take(10).foreach(println)

      cleanedWords
        .map(word => (signature(word), List(word)))
        .reduceByKey(_ ++ _)
        // Keep only keys shared by two or more distinct words, i.e. real anagram groups.
        .filter { case (_, words) => words.lengthCompare(1) > 0 }
        .take(10)
        .foreach(println)
    } finally {
      // Release the Spark context even when the job fails.
      spark.stop()
    }
  }
}
/*
Sample output of the anagram-grouping step (sorted letters, words sharing them):

(List(a, i, l, r, t),List(trial, trail))
(List(a, e, l, r, r, y),List(larrey, rarely))
(List(e, g, n, o, o, r, r, s, v),List(grosvenor, governors))
(List(d, e, e, f, m, o, p, r, r),List(performed, preformed))
(List(0, 2, 7),List(270, 207))
(List(d, e, i, k, l, n),List(linked, kindle))
(List(a, b, g, r),List(brag, garb, grab))
(List(d, e, e, e, p, s, t),List(deepest, deepset))
(List(a, c, e, h, s, t),List(sacthe, chaste))
(List(a, e, e, g, m, r),List(meagre, meager))
*/