"""Train, persist, reload, and evaluate a linear-regression model with PySpark ML.

Reads a CSV of feature columns plus a target column, assembles the features
into a single vector column, fits a LinearRegression inside a Pipeline,
saves the fitted model to disk, reloads it, and reports RMSE on a held-out
test split.
"""
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator

# Step 1: Initialize Spark session
spark = (
    SparkSession.builder
    .appName("Linear Regression with PySpark")
    .getOrCreate()
)

# Step 2: Load data from CSV
data_path = "your_data.csv"  # Replace with your CSV file path
data = spark.read.csv(data_path, header=True, inferSchema=True)

# Step 3: Inspect the data
data.printSchema()
data.show(5)

# Step 4: Feature engineering (build the feature vector)
# Assuming the CSV has columns like 'feature1', 'feature2', ..., and 'target'
feature_columns = ["feature1", "feature2", "feature3"]  # Replace with your feature columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Step 5: Define the linear regression model
lr = LinearRegression(featuresCol="features", labelCol="target")  # Replace 'target' with your target column

# Step 6: Build a pipeline (assemble features, then fit the regressor)
pipeline = Pipeline(stages=[assembler, lr])

# Step 7: Split data into training and test sets (seed fixed for reproducibility)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Step 8: Train the model
model = pipeline.fit(train_data)

# Step 9: Save the model (overwrite() replaces any previous copy at the path)
model_save_path = "path_to_save_model"  # Replace with desired path
model.write().overwrite().save(model_save_path)

# Step 10: Load the saved model
loaded_model = PipelineModel.load(model_save_path)

# Step 11: Make predictions on the test data
predictions = loaded_model.transform(test_data)

# Step 12: Evaluate the model with root-mean-squared error
evaluator = RegressionEvaluator(labelCol="target", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Step 13: Show sample predictions
predictions.select("features", "target", "prediction").show(5)

# Stop the Spark session to release cluster resources
spark.stop()