36 lines
1.0 KiB
Python
36 lines
1.0 KiB
Python
from pyspark.sql import SparkSession
|
|
from pyspark.sql.functions import explode
|
|
from pyspark.sql.functions import split
|
|
|
|
if __name__ == "__main__":
|
|
spark = SparkSession \
|
|
.builder \
|
|
.appName("StructuredNetworkWordCount") \
|
|
.getOrCreate()
|
|
print("Spark version: ", spark.version)
|
|
# Create DataFrame representing the stream of input lines from connection to localhost:9999
|
|
lines = spark \
|
|
.readStream \
|
|
.format("socket") \
|
|
.option("host", "niit-node2") \
|
|
.option("port", 9092) \
|
|
.load()
|
|
# Split the lines into words
|
|
words = lines.select(
|
|
explode(
|
|
split(lines.value, " ")
|
|
).alias("word")
|
|
)
|
|
|
|
# Generate running word count
|
|
wordCounts = words.groupBy("word").count()
|
|
|
|
print("wordCounts: ", wordCounts)
|
|
# Start running the query that prints the running counts to the console
|
|
query = wordCounts \
|
|
.writeStream \
|
|
.outputMode("complete") \
|
|
.format("console") \
|
|
.start()
|
|
|
|
query.awaitTermination() |