PySpark_WordCount

PySpark Word Count

Demonstration of PySpark running on Amazon Elastic Compute Cloud (EC2)

Start Spark Services: ./sbin/start-all.sh

Submit Job: ./bin/spark-submit WordCount.py

Stop Services: ./sbin/stop-all.sh

In [ ]:
"""Word-count job for Spark.

Start Spark services:  ./sbin/start-all.sh
Submit the job:        ./bin/spark-submit WordCount.py
Stop services:         ./sbin/stop-all.sh
"""
from pyspark import SparkContext


def main():
    """Count word frequencies in the input corpus and save the result.

    Reads the corpus from the driver's local filesystem, tokenizes each
    line on whitespace, and writes the (word, count) pairs as text under
    the output directory (one part-file per partition).

    NOTE(review): ``saveAsTextFile`` raises if the output directory
    already exists — remove it between runs.
    """
    sc = SparkContext()
    try:
        lines = sc.textFile("file:///home/ec2-user/data/InputCorpus.txt")

        # str.split() with no argument splits on any run of whitespace and
        # drops empty tokens; the original split(" ") counted "" as a word
        # for every doubled space and did not split on tabs.
        words = lines.flatMap(lambda line: line.split())

        pairs = words.map(lambda word: (word, 1))
        counts = pairs.reduceByKey(lambda a, b: a + b)

        counts.saveAsTextFile("file:///home/ec2-user/data/WordCountOutput")
    finally:
        # Always release the driver's cluster resources, even if the job
        # fails partway through.
        sc.stop()


if __name__ == "__main__":
    main()