Spark-MLlib-特征抽取

2023年9月18日 212次阅读 来源: Codlife一王家二公子
参考资料：Spark MLlib 官方文档
package lean_mllib

//import breeze.linalg.PCA
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vectors

/**
  * Small runnable demos for several `org.apache.spark.ml.feature` transformers:
  * stop-word removal, n-grams, PCA, polynomial expansion, DCT, string indexing
  * and one-hot encoding. Every demo builds a tiny in-memory DataFrame, applies
  * one transformer and prints the result to stdout.
  *
  * Created by wjf on 2016/8/15.
  */
object TestCountVector {

  // Session shared by all demos; obtained from the project's MLLibConf helper.
  val spark=MLLibConf.spark

  /**
    * Entry point. Runs exactly one demo; uncomment a different call below to
    * try another transformer.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // CountVectorizer demo, kept for reference (disabled).
  /*  val df= spark.createDataFrame(Seq(
      (0,Array("a","b","C","c")),
      (1,Array("a","a","b","b","c","C"))
    )).toDF("id","words")

    val cvModel:CountVectorizerModel =new CountVectorizer().setInputCol("words").setOutputCol("features").setVocabSize(3).setMinDF(2).fit(df)

    val cvm =new CountVectorizerModel(Array("a","b","c")).setInputCol("words").setOutputCol("features")
    cvModel.transform(df).select("features","words").show(false)*/

//    removeStopWord()
//    nGram()
//    testPCA()
//    testPolynomialExpansion()

//    testDiscreteCosineTransform()

//    testStringIndexer()
    testOneHotCoder()

  }


  /** Applies `StopWordsRemover` to the "raw" column and shows the frame with the added "filtered" column. */
  def removeStopWord(): Unit = {
    val remover = new StopWordsRemover().setInputCol("raw").setOutputCol("filtered")
    val dataSet = spark.createDataFrame(Seq(
      (0,Seq("I","saw","the","red","baloon")),
      (1,Seq("Marry","had","a","little","lamb"))
    )).toDF("id","raw")
    remover.transform(dataSet).show()
  }

  /** Builds n-grams from the "words" column and prints the "ngrams" value of each row as a List. */
  def nGram(): Unit = {
    val wordDataFrame = spark.createDataFrame(Seq(
      (0,Array("Hi","I","heard","about","Spark")),
      (1,Array("I","wish","Java","Could","use","case","classes")),
      (2,Array("Logistic","regression","models","are","neat"))
    )).toDF("label","words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    // n = 10 is longer than every row above (5, 7 and 5 words); NGram emits no
    // n-grams for inputs shorter than n, so each printed list is empty.
    // Lower n (e.g. 2) to see actual n-grams.
    val ngramDataFrame = ngram.setN(10).transform(wordDataFrame)
    // Read the array column through the generic Seq interface: Spark documents
    // ArrayType values as scala.collection.Seq, whereas casting to a concrete
    // class such as Stream depends on the runtime representation and can fail.
    ngramDataFrame.take(3)
      .map(_.getAs[scala.collection.Seq[String]]("ngrams").toList)
      .foreach(println)

  }

  /** Projects 5-dimensional vectors onto their first principal component and prints the result. */
  def testPCA(): Unit = {
    val data = Array(
      Vectors.sparse(5,Seq((1,1.0),(3,7.0))),
      Vectors.dense(2.0,0.0,3.0,4.0,5.0),
      Vectors.dense(4.0,0.0,0.0,6.0,7.0)
    )
    // Pair every vector with its row index so the frame really has the two
    // columns ("label", "features") that are named here and selected below.
    // Wrapping in Tuple1 produced a single column, which made toDF with two
    // names fail with a column-count mismatch.
    val labelled = data.toSeq.zipWithIndex.map { case (features, label) => (label, features) }
    val df = spark.createDataFrame(labelled).toDF("label","features")
    df.take(10).foreach(println)
    val pca = new PCA().setInputCol("features").setOutputCol("pcaFeatures").setK(1).fit(df)
    val pcaDF = pca.transform(df)
    pcaDF.take(10).foreach(println)
    val result = pcaDF.select("pcaFeatures","label")
    result.show()
  }

  /** Expands 2-dimensional vectors into their degree-3 polynomial feature space and prints each row. */
  def testPolynomialExpansion(): Unit = {
    val data = Array(
      Vectors.dense(-2.0,2.3),
      Vectors.dense(0.0,0.0),
      Vectors.dense(0.6,-1.1)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    val polynomialExpansion = new PolynomialExpansion().setInputCol("features").setOutputCol("polyFeatures").setDegree(3)
    val polyDF = polynomialExpansion.transform(df)
    polyDF.select("polyFeatures").take(3).foreach(println)
  }

  /** Applies the forward (non-inverse) discrete cosine transform to each vector and shows the result. */
  def testDiscreteCosineTransform(): Unit = {
    val data = Seq(
      Vectors.dense(0.0,1.0,-2.0,3.0),
      Vectors.dense(-1.0,2.0,4.0,-7.0),
      Vectors.dense(14.0,-2.0,-5.0,1.0)
    )
    data.foreach(println)

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    // take(n) brings up to n rows to the driver program; with a very large n
    // the driver may run out of memory.
    df.take(10).foreach(println)
    val dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false)

    val dctDF = dct.transform(df)

    dctDF.select("featuresDCT").show(3)

  }


  /**
    * Indexes the string "category" column with `StringIndexer`, then maps the
    * indices back to strings with `IndexToString` and shows both columns.
    */
  def testStringIndexer(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0,"a"),(1,"b"),(2,"c"),(3,"a"),(4,"a"),(5,"c")
    )).toDF("id","category")

    df.take(6).foreach(println)
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)

    val indexed = indexer.transform(df)
    indexed.take(6).foreach(println)

    // No labels are set on the converter; it is applied to the column that the
    // fitted indexer just produced.
    val converter = new IndexToString().setInputCol("categoryIndex").setOutputCol("originalCategory")

    val converted = converter.transform(indexed)
    converted.select("id","categoryIndex","originalCategory").show()

  }

  /** Indexes the "category" column, one-hot encodes the index and shows the resulting vectors. */
  def testOneHotCoder(): Unit = {
    val df = spark.createDataFrame(Seq(
      (0,"a"),(1,"b"),(2,"c"),(3,"a"),(4,"a"),(5,"c"),(6,"b")
    )).toDF("id","category")
    val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex").fit(df)

    val indexed = indexer.transform(df)
    // NOTE(review): OneHotEncoder is used directly as a transformer here; in
    // newer Spark releases it is an estimator that must be fit() first —
    // confirm against the Spark version this project builds with.
    val encoder = new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec")

    val encoded = encoder.transform(indexed)
    encoded.select("id","categoryVec").show()

  }

}
    原文作者：Codlife一王家二公子
    原文地址: https://www.jianshu.com/p/48a668ffb010
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。