武翔

Add kmeans_example.py, a PySpark k-means clustering example.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import print_function

# $example on$
from pyspark.ml.clustering import KMeans
# $example off$

from pyspark.sql import SparkSession

27 +"""
28 +An example demonstrating k-means clustering.
29 +Run with:
30 + bin/spark-submit examples/src/main/python/ml/kmeans_example.py
31 +
32 +This example requires NumPy (http://www.numpy.org/).
33 +"""


if __name__ == "__main__":

    spark = SparkSession\
        .builder\
        .appName("PythonKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)
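
    # The fitted model can assign a cluster to each row with
    # model.transform(dataset), which appends an integer "prediction" column;
    # this is what the evaluator sketch below relies on.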

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(wssse))
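
    # An alternative, left here only as a commented sketch: on Spark 2.3+ the
    # silhouette score from pyspark.ml.evaluation.ClusteringEvaluator can be
    # used instead (computeCost is deprecated in Spark 3.x in favor of this
    # approach). Assumes the default column names "features" and "prediction":
    #
    #   from pyspark.ml.evaluation import ClusteringEvaluator
    #   predictions = model.transform(dataset)
    #   silhouette = ClusteringEvaluator().evaluate(predictions)
    #   print("Silhouette with squared euclidean distance = " + str(silhouette))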

    # Shows the result (cluster centers).
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()