From 3d3d30f8286362769084ef49de6178460ebcecbf Mon Sep 17 00:00:00 2001
From: Erik Bernhardson <ebernhardson@wikimedia.org>
Date: Mon, 13 Feb 2023 11:10:21 -0800
Subject: [PATCH 1/3] Fork library to org.wikimedia.search

The GitHub project was last updated in 2017 and seems to be abandoned.
The implementation, though, still seems to work as well today as it did
when written. As part of the search platform migration to Spark 3 we
need to bring this dependency forward to Spark 3 as well. Fork the
library into Wikimedia GitLab so we can make the minor updates
necessary for Spark 3 support.

* To avoid any conflicts, update the groupId in the pom.xml to our own
 group and retain the author's name in the artifactId.
* Set up maven-release-plugin to support releasing to archiva.
* Increment minor version and attach -SNAPSHOT to support release
 plugin workflow.
---
 pom.xml | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/pom.xml b/pom.xml
index 6fa3b03..e14c956 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,9 +1,9 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
   <modelVersion>4.0.0</modelVersion>
-  <version>1.4.1</version>
+  <version>1.5.0-SNAPSHOT</version>
   <name>${project.artifactId}</name>
-  <groupId>sramirez</groupId>
-  <artifactId>spark-MDLP-discretization</artifactId>
+  <groupId>org.wikimedia.search</groupId>
+  <artifactId>sramirez-spark-MDLP-discretization</artifactId>
   <inceptionYear>2016</inceptionYear>
   <properties>
     <maven.compiler.source>1.6</maven.compiler.source>
@@ -12,6 +12,24 @@
     <scala.tools.version>2.11</scala.tools.version>
     <scala.version>2.11.6</scala.version>
   </properties>
+
+  <scm>
+    <developerConnection>scm:git:ssh://git@gitlab.wikimedia.org:repos/search-platform/spark-MDLP-discretization.git</developerConnection>
+  </scm>
+
+  <distributionManagement>
+    <repository>
+      <id>archiva.releases</id>
+      <name>Wikimedia Release Repository</name>
+      <url>https://archiva.wikimedia.org/repository/releases/</url>
+    </repository>
+    <snapshotRepository>
+      <id>archiva.snapshots</id>
+      <name>Wikimedia Snapshot Repository</name>
+      <url>https://archiva.wikimedia.org/repository/snapshots/</url>
+    </snapshotRepository>
+  </distributionManagement>
+
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -42,10 +60,10 @@
          <scope>test</scope>
     </dependency>
     <dependency>
-	  <groupId>joda-time</groupId>
-	  <artifactId>joda-time</artifactId>
-	  <version>2.9.4</version>
-	</dependency>
+      <groupId>joda-time</groupId>
+      <artifactId>joda-time</artifactId>
+      <version>2.9.4</version>
+    </dependency>
   </dependencies>
 
   <build>
@@ -81,13 +99,18 @@
           <useFile>false</useFile>
           <disableXmlReport>true</disableXmlReport>
           <!-- If you have classpath issue like NoDefClassError,... -->
-		  <!-- useManifestOnlyJar>false</useManifestOnlyJar -->
+          <!-- useManifestOnlyJar>false</useManifestOnlyJar -->
           <includes>
             <include>**/*Test.*</include>
             <include>**/*Suite.*</include>
           </includes>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-release-plugin</artifactId>
+        <version>3.0.0-M7</version>
+      </plugin>
     </plugins>
   </build>
 </project>
-- 
GitLab


From 1bb8865d53c4287044a7c039a944ea9698f07bc3 Mon Sep 17 00:00:00 2001
From: Erik Bernhardson <ebernhardson@wikimedia.org>
Date: Mon, 13 Feb 2023 11:14:49 -0800
Subject: [PATCH 2/3] Fix test suite

The upstream test suite doesn't pass as-is; update it as appropriate
to get things going again.

* FeatureUtils was renamed in 924fd59b1e but one reference in the test suite
 was left behind. Remove that reference.
* Tests for an exact value have been failing since fb222c034e, which
 says it fixed some precision issues. Assume that the fix was correct
 and update the test values to match what comes out.
---
 .../org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala  | 6 +++---
 .../org/apache/spark/ml/feature/ThresholdFinderSuite.scala  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
index beba44d..12fb62c 100644
--- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
@@ -260,7 +260,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
         |-Infinity, Infinity;
         |-Infinity, 0.5, Infinity;
         |-Infinity, Infinity;
-        |-Infinity, 1.4435658E12, Infinity
+        |-Infinity, 1.44359817E12, Infinity
         |""".stripMargin.replaceAll(System.lineSeparator(), "")) {
       model.splits.map(a => a.mkString(", ")).mkString(";")
     }
@@ -294,7 +294,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
         |-Infinity, Infinity;
         |-Infinity, 0.5, Infinity;
         |-Infinity, Infinity;
-        |-Infinity, 1.4435658E12, Infinity
+        |-Infinity, 1.44359817E12, Infinity
         |""".stripMargin.replaceAll(System.lineSeparator(), "")) {
       model.splits.map(a => a.mkString(", ")).mkString(";")
     }
@@ -331,7 +331,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
         |-Infinity, 13.825001, 28.2, 41.9896, 47.0, 51.67085, 152.50626, Infinity;
         |-Infinity, 2.5, Infinity;
         |-Infinity, Infinity;
-        |-Infinity, 1.4435658E12, Infinity
+        |-Infinity, 1.44359817E12, Infinity
         |""".stripMargin.replaceAll(System.lineSeparator(), "")) {
       model.splits.map(a => a.mkString(", ")).mkString(";")
     }
diff --git a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
index 3df75c8..5f693f4 100644
--- a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
@@ -1,6 +1,6 @@
 package org.apache.spark.ml.feature
 
-import org.apache.spark.mllib.feature.{BucketInfo, FeatureUtils, ThresholdFinder}
+import org.apache.spark.mllib.feature.{BucketInfo, ThresholdFinder}
 import org.apache.spark.sql.SQLContext
 import org.junit.runner.RunWith
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
-- 
GitLab


From 30cdde9b97c583fb7231ddcfd35c268e57a784d1 Mon Sep 17 00:00:00 2001
From: Erik Bernhardson <ebernhardson@wikimedia.org>
Date: Mon, 13 Feb 2023 11:04:16 -0800
Subject: [PATCH 3/3] Migrate to spark3

* Update spark deps to 3.1.2, which requires updating scala to 2.12.
* The previous version of scalatest isn't supported on scala 2.12; update
 to 3.2.15 for scala 2.12 support. This required a minor update to the
 test suite class declarations but not to the tests themselves.
* The updated version of scalatest doesn't have direct junit support, so
 add a dep for scalatestplus:junit to bring it back in. That artifact
 doesn't support junit 4.12, which was in use, so update to junit 4.13
 to match.
* spark3 added methods of handling invalid features (NaN, null) passed
 to VectorAssembler; update the test suite to use `keep`, which matches
 the test cases' usage.
---
 pom.xml                                       | 22 ++++++++++++-------
 .../spark/ml/feature/MDLPDiscretizer.scala    |  2 +-
 .../ml/feature/DiscretizationUtilsSuite.scala |  7 +++---
 .../FewValuesThresholdFinderSuite.scala       |  7 +++---
 .../InitialThresholdsFinderSuite.scala        |  7 +++---
 .../ml/feature/MDLPDiscretizerBigSuite.scala  |  7 +++---
 .../ml/feature/MDLPDiscretizerHugeSuite.scala |  7 +++---
 .../ml/feature/MDLPDiscretizerSuite.scala     |  7 +++---
 .../ManyValuesThresholdFinderSuite.scala      |  7 +++---
 .../apache/spark/ml/feature/TestHelper.scala  |  1 +
 .../ml/feature/ThresholdFinderSuite.scala     |  7 +++---
 11 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/pom.xml b/pom.xml
index e14c956..7c8b75b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -9,8 +9,8 @@
     <maven.compiler.source>1.6</maven.compiler.source>
     <maven.compiler.target>1.6</maven.compiler.target>
     <encoding>UTF-8</encoding>
-    <scala.tools.version>2.11</scala.tools.version>
-    <scala.version>2.11.6</scala.version>
+    <scala.tools.version>2.12</scala.tools.version>
+    <scala.version>2.12.7</scala.version>
   </properties>
 
   <scm>
@@ -33,13 +33,13 @@
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>spark-core_2.11</artifactId>
-      <version>2.1.1</version>
+      <artifactId>spark-core_2.12</artifactId>
+      <version>3.1.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>spark-mllib_2.11</artifactId>
-      <version>2.1.1</version>
+      <artifactId>spark-mllib_2.12</artifactId>
+      <version>3.1.2</version>
     </dependency>
     <dependency>
       <groupId>org.scala-lang</groupId>
@@ -50,13 +50,19 @@
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_${scala.tools.version}</artifactId>
-      <version>2.2.4</version>
+      <version>3.2.15</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.scalatestplus</groupId>
+      <artifactId>junit-4-13_${scala.tools.version}</artifactId>
+      <version>3.2.15.0</version>
       <scope>test</scope>
     </dependency>
     <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
-         <version>4.12</version>
+         <version>4.13</version>
          <scope>test</scope>
     </dependency>
     <dependency>
diff --git a/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala b/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala
index 73ebb4e..dc5d896 100644
--- a/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala
+++ b/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala
@@ -260,7 +260,7 @@ object DiscretizerModel extends MLReadable[DiscretizerModel] {
             .select("splits")
             .head()
       val model = new DiscretizerModel(metadata.uid, splits)
-      DefaultParamsReader.getAndSetParams(model, metadata)
+      metadata.getAndSetParams(model)
       model
     }
   }
diff --git a/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala b/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala
index 7248dc7..85fa4a5 100644
--- a/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala
@@ -2,8 +2,9 @@ package org.apache.spark.ml.feature
 
 import org.apache.spark.mllib.feature.DiscretizationUtils._
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatestplus.junit.JUnitRunner
 
 
 /**
@@ -12,7 +13,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite}
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class FeatureUtilsSuite extends FunSuite {
+class FeatureUtilsSuite extends AnyFunSuite {
 
   test("Test entropy calc (typical 1)") {
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala
index 1b77862..54581b9 100644
--- a/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala
@@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._
 import org.apache.spark.mllib.feature.FewValuesThresholdFinder
 import org.apache.spark.sql.{Row, SQLContext}
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatestplus.junit.JUnitRunner
 
 
 /**
@@ -15,7 +16,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite}
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class FewValuesThresholdFinderSuite extends FunSuite with BeforeAndAfterAll {
+class FewValuesThresholdFinderSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = _
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala
index c2f0be7..bf269b7 100644
--- a/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala
@@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._
 import org.apache.spark.mllib.feature.{FewValuesThresholdFinder, InitialThresholdsFinder}
 import org.apache.spark.sql.SQLContext
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatestplus.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 
 /**
@@ -15,7 +16,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite}
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class InitialThresholdsFinderSuite extends FunSuite with BeforeAndAfterAll {
+class InitialThresholdsFinderSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = _
   val finder = new InitialThresholdsFinder()
diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala
index d1491cd..edafb01 100644
--- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala
@@ -2,8 +2,9 @@ package org.apache.spark.ml.feature
 
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatestplus.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 import org.apache.spark.ml.feature.TestHelper._
 
 
@@ -13,7 +14,7 @@ import org.apache.spark.ml.feature.TestHelper._
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class MDLPDiscretizerBigSuite extends FunSuite with BeforeAndAfterAll {
+class MDLPDiscretizerBigSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = null
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala
index 1da4293..5cd378f 100644
--- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala
@@ -3,8 +3,9 @@ package org.apache.spark.ml.feature
 import org.apache.spark.ml.feature.TestHelper._
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatestplus.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 
 /**
@@ -13,7 +14,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite}
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class MDLPDiscretizerHugeSuite extends FunSuite with BeforeAndAfterAll {
+class MDLPDiscretizerHugeSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = null
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
index 12fb62c..b55d72e 100644
--- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala
@@ -2,8 +2,9 @@ package org.apache.spark.ml.feature
 
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.junit.runner.RunWith
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.scalatest.junit.JUnitRunner
+import org.scalatestplus.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 import TestHelper._
 
 
@@ -13,7 +14,7 @@ import TestHelper._
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll {
+class MDLPDiscretizerSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = _
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala
index aa437ec..ac3fdf1 100644
--- a/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala
@@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._
 import org.apache.spark.mllib.feature.ManyValuesThresholdFinder
 import org.apache.spark.sql.SQLContext
 import org.junit.runner.RunWith
-import org.scalatest.junit.JUnitRunner
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatestplus.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
 
 
 /**
@@ -15,7 +16,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite}
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class ManyValuesThresholdFinderSuite extends FunSuite with BeforeAndAfterAll {
+class ManyValuesThresholdFinderSuite extends AnyFunSuite with BeforeAndAfterAll {
 
   var sqlContext: SQLContext = _
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala b/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala
index fd82aa3..70a2831 100644
--- a/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala
@@ -36,6 +36,7 @@ object TestHelper {
                              approximate: Boolean = false): DiscretizerModel = {
     val featureAssembler = new VectorAssembler()
       .setInputCols(inputCols)
+      .setHandleInvalid("keep")
       .setOutputCol("features")
     val processedDf = featureAssembler.transform(dataframe)
 
diff --git a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
index 5f693f4..4974dd3 100644
--- a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
+++ b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala
@@ -3,8 +3,9 @@ package org.apache.spark.ml.feature
 import org.apache.spark.mllib.feature.{BucketInfo, ThresholdFinder}
 import org.apache.spark.sql.SQLContext
 import org.junit.runner.RunWith
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.scalatest.junit.JUnitRunner
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatestplus.junit.JUnitRunner
 
 
 /**
@@ -13,7 +14,7 @@ import org.scalatest.junit.JUnitRunner
   * @author Barry Becker
   */
 @RunWith(classOf[JUnitRunner])
-class ThresholdFinderSuite extends FunSuite {
+class ThresholdFinderSuite extends AnyFunSuite {
 
 
   test("Test calcCriterion with even split hence low criterion value (and high entropy)") {
-- 
GitLab