add a file

武翔
Commit a42e10e7de07ba679381499d6af9a7493616b009 a42e10e7 1 parent 0b5e237d
Showing 1 changed file with 62 additions and 0 deletions
test/kmeans_example.py
--- a/test/kmeans_example.py 0 → 100644
View file @a42e10e
+++ b/test/kmeans_example.py 0 → 100644
View file @a42e10e
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements.  See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License.  You may obtain a copy of the License at
+ #
+ #    http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ from __future__ import print_function
+ 
+ # $example on$
+ from pyspark.ml.clustering import KMeans
+ # $example off$
+ 
+ from pyspark.sql import SparkSession
+ 
+ """
+ An example demonstrating k-means clustering.
+ Run with:
+   bin/spark-submit examples/src/main/python/ml/kmeans_example.py
+ 
+ This example requires NumPy (http://www.numpy.org/).
+ """
+ 
+ 
+ if __name__ == "__main__":
+ 
+     spark = SparkSession\
+         .builder\
+         .appName("PythonKMeansExample")\
+         .getOrCreate()
+ 
+     # 例子
+     # 加载数据
+     dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")#加载数据
+ 
+     # Trains a k-means model.
+     kmeans = KMeans().setK(2).setSeed(1)  #k-means 模型
+     model = kmeans.fit(dataset)		#建立模型
+ 
+     # Evaluate clustering by computing Within Set Sum of Squared Errors.通过计算误差项平方和内的聚类分析
+     wssse = model.computeCost(dataset)
+     print("Within Set Sum of Squared Errors = " + str(wssse))
+ 
+     # 显示结果  clusterCenters 聚类中心
+     centers = model.clusterCenters()
+     print("Cluster Centers: ")
+     for center in centers:
+         print(center)
+     # $example off$
+ 
+     spark.stop()