spark/testconnect.py

from pyspark.sql import SparkSession
import os

# Point Spark at the local JDK and Hadoop installs; raw strings keep the
# Windows backslashes from being read as escape sequences
os.environ['JAVA_HOME'] = r'D:\CodeDevelopment\DevelopmentEnvironment\Java\jdk-17.0.5'
os.environ['HADOOP_HOME'] = r'D:\CodeDevelopment\DevelopmentEnvironment\hadoop-2.8.1'
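# Note (assumption): on Windows, local Spark jobs typically also need
# winutils.exe under %HADOOP_HOME%\bin; verify it is present for this setup.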
# Create the SparkSession; the two spark.jars.packages entries are combined
# into one comma-separated list, since a second .config() call with the same
# key silently overwrites the first (which dropped the Kafka connector)
spark = SparkSession \
    .builder \
    .appName("Kafka Example") \
    .master("local[*]") \
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,"
            "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.executorEnv.PATH", r"D:\CodeDevelopment\DevelopmentEnvironment\Java\jdk-17.0.5") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()
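# Note: the spark-sql-kafka connector version should track the installed
# PySpark version (3.1.2 is assumed here), and delta-core 2.4.0 targets
# Spark 3.4.x, so both coordinates should be aligned with whichever Spark
# release is actually installed.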
# Read the Kafka stream and cast the raw key/value bytes to strings
# (the selectExpr result must be reassigned; DataFrames are immutable)
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "niit-node2:9092") \
    .option("subscribe", "orders") \
    .load()
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
# Display the data: a streaming DataFrame cannot use show(), so stream the
# rows to a console sink and block until the query terminates
query = df.writeStream \
    .format("console") \
    .outputMode("append") \
    .start()
query.awaitTermination()
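# --- Optional sketch: persisting the stream with Delta Lake ---
# The session registers the Delta extensions above but never writes a Delta
# table. A minimal sketch of streaming the casted records into a local Delta
# table; the paths ./delta/orders and ./delta/checkpoints are hypothetical.
#
# delta_query = df.writeStream \
#     .format("delta") \
#     .outputMode("append") \
#     .option("checkpointLocation", "./delta/checkpoints/orders") \
#     .start("./delta/orders")
# delta_query.awaitTermination()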