From 3d3d30f8286362769084ef49de6178460ebcecbf Mon Sep 17 00:00:00 2001 From: Erik Bernhardson <ebernhardson@wikimedia.org> Date: Mon, 13 Feb 2023 11:10:21 -0800 Subject: [PATCH 1/3] Fork library to org.wikimedia.search The github project was last updated in 2017; this project seems abandoned. The implementation, though, still seems to work as well today as it did when written. As part of the search platform migration to spark3 we need to bring this dependency forward into spark 3 as well. Fork the library into wikimedia gitlab so we can make the minor updates necessary for spark 3 support. * To avoid any conflicts update the groupId in the pom.xml to our own group and retain the author's name in the artifactId. * Set up maven-release-plugin to support releasing to archiva. * Increment minor version and attach -SNAPSHOT to support release plugin workflow. --- pom.xml | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 6fa3b03..e14c956 100644 --- a/pom.xml +++ b/pom.xml @@ -1,9 +1,9 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> - <version>1.4.1</version> + <version>1.5.0-SNAPSHOT</version> <name>${project.artifactId}</name> - <groupId>sramirez</groupId> - <artifactId>spark-MDLP-discretization</artifactId> + <groupId>org.wikimedia.search</groupId> + <artifactId>sramirez-spark-MDLP-discretization</artifactId> <inceptionYear>2016</inceptionYear> <properties> <maven.compiler.source>1.6</maven.compiler.source> @@ -12,6 +12,24 @@ <scala.tools.version>2.11</scala.tools.version> <scala.version>2.11.6</scala.version> </properties> + + <scm> + <developerConnection>scm:git:ssh://git@gitlab.wikimedia.org:repos/search-platform/spark-MDLP-discretization.git</developerConnection> + </scm> + + <distributionManagement> 
+ <repository> + <id>archiva.releases</id> + <name>Wikimedia Release Repository</name> + <url>https://archiva.wikimedia.org/repository/releases/</url> + </repository> + <snapshotRepository> + <id>archiva.snapshots</id> + <name>Wikimedia Snapshot Repository</name> + <url>https://archiva.wikimedia.org/repository/snapshots/</url> + </snapshotRepository> + </distributionManagement> + <dependencies> <dependency> <groupId>org.apache.spark</groupId> @@ -42,10 +60,10 @@ <scope>test</scope> </dependency> <dependency> - <groupId>joda-time</groupId> - <artifactId>joda-time</artifactId> - <version>2.9.4</version> - </dependency> + <groupId>joda-time</groupId> + <artifactId>joda-time</artifactId> + <version>2.9.4</version> + </dependency> </dependencies> <build> @@ -81,13 +99,18 @@ <useFile>false</useFile> <disableXmlReport>true</disableXmlReport> <!-- If you have classpath issue like NoDefClassError,... --> - <!-- useManifestOnlyJar>false</useManifestOnlyJar --> + <!-- useManifestOnlyJar>false</useManifestOnlyJar --> <includes> <include>**/*Test.*</include> <include>**/*Suite.*</include> </includes> </configuration> </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-release-plugin</artifactId> + <version>3.0.0-M7</version> + </plugin> </plugins> </build> </project> -- GitLab From 1bb8865d53c4287044a7c039a944ea9698f07bc3 Mon Sep 17 00:00:00 2001 From: Erik Bernhardson <ebernhardson@wikimedia.org> Date: Mon, 13 Feb 2023 11:14:49 -0800 Subject: [PATCH 2/3] Fix test suite The upstream test suite doesn't directly pass, update as appropriate to get things going again. * FeatureUtils was renamed in 924fd59b1e but one reference in the test suite was left behind. Remove that reference. * Tests for an exact value have been failing since fb222c034e, which says it fixed some precision issues. Assume that the fix was correct and update the test values to match what comes out. 
--- .../org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala | 6 +++--- .../org/apache/spark/ml/feature/ThresholdFinderSuite.scala | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala index beba44d..12fb62c 100644 --- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala @@ -260,7 +260,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll { |-Infinity, Infinity; |-Infinity, 0.5, Infinity; |-Infinity, Infinity; - |-Infinity, 1.4435658E12, Infinity + |-Infinity, 1.44359817E12, Infinity |""".stripMargin.replaceAll(System.lineSeparator(), "")) { model.splits.map(a => a.mkString(", ")).mkString(";") } @@ -294,7 +294,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll { |-Infinity, Infinity; |-Infinity, 0.5, Infinity; |-Infinity, Infinity; - |-Infinity, 1.4435658E12, Infinity + |-Infinity, 1.44359817E12, Infinity |""".stripMargin.replaceAll(System.lineSeparator(), "")) { model.splits.map(a => a.mkString(", ")).mkString(";") } @@ -331,7 +331,7 @@ class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll { |-Infinity, 13.825001, 28.2, 41.9896, 47.0, 51.67085, 152.50626, Infinity; |-Infinity, 2.5, Infinity; |-Infinity, Infinity; - |-Infinity, 1.4435658E12, Infinity + |-Infinity, 1.44359817E12, Infinity |""".stripMargin.replaceAll(System.lineSeparator(), "")) { model.splits.map(a => a.mkString(", ")).mkString(";") } diff --git a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala index 3df75c8..5f693f4 100644 --- a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala @@ -1,6 +1,6 @@ package 
org.apache.spark.ml.feature -import org.apache.spark.mllib.feature.{BucketInfo, FeatureUtils, ThresholdFinder} +import org.apache.spark.mllib.feature.{BucketInfo, ThresholdFinder} import org.apache.spark.sql.SQLContext import org.junit.runner.RunWith import org.scalatest.{BeforeAndAfterAll, FunSuite} -- GitLab From 30cdde9b97c583fb7231ddcfd35c268e57a784d1 Mon Sep 17 00:00:00 2001 From: Erik Bernhardson <ebernhardson@wikimedia.org> Date: Mon, 13 Feb 2023 11:04:16 -0800 Subject: [PATCH 3/3] Migrate to spark3 * Update spark deps to 3.1.2, which requires updating scala to 2.12. * The previous version of scalatest isn't supported on scala 2.12; update to 3.2.15 for scala 2.12 support. This required a minor update to test suite class declarations but not to the tests themselves. * The updated version of scalatest doesn't have direct junit support; added a dep for scalatestplus:junit to bring it back in. That dep doesn't support junit 4.12, which was in use, so junit is updated to 4.13 to match. * spark3 added options for handling invalid features (NaN, null) passed to VectorAssembler; update the test helper to use `keep`, which matches the test cases' usage. 
--- pom.xml | 22 ++++++++++++------- .../spark/ml/feature/MDLPDiscretizer.scala | 2 +- .../ml/feature/DiscretizationUtilsSuite.scala | 7 +++--- .../FewValuesThresholdFinderSuite.scala | 7 +++--- .../InitialThresholdsFinderSuite.scala | 7 +++--- .../ml/feature/MDLPDiscretizerBigSuite.scala | 7 +++--- .../ml/feature/MDLPDiscretizerHugeSuite.scala | 7 +++--- .../ml/feature/MDLPDiscretizerSuite.scala | 7 +++--- .../ManyValuesThresholdFinderSuite.scala | 7 +++--- .../apache/spark/ml/feature/TestHelper.scala | 1 + .../ml/feature/ThresholdFinderSuite.scala | 7 +++--- 11 files changed, 48 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index e14c956..7c8b75b 100644 --- a/pom.xml +++ b/pom.xml @@ -9,8 +9,8 @@ <maven.compiler.source>1.6</maven.compiler.source> <maven.compiler.target>1.6</maven.compiler.target> <encoding>UTF-8</encoding> - <scala.tools.version>2.11</scala.tools.version> - <scala.version>2.11.6</scala.version> + <scala.tools.version>2.12</scala.tools.version> + <scala.version>2.12.7</scala.version> </properties> <scm> @@ -33,13 +33,13 @@ <dependencies> <dependency> <groupId>org.apache.spark</groupId> - <artifactId>spark-core_2.11</artifactId> - <version>2.1.1</version> + <artifactId>spark-core_2.12</artifactId> + <version>3.1.2</version> </dependency> <dependency> <groupId>org.apache.spark</groupId> - <artifactId>spark-mllib_2.11</artifactId> - <version>2.1.1</version> + <artifactId>spark-mllib_2.12</artifactId> + <version>3.1.2</version> </dependency> <dependency> <groupId>org.scala-lang</groupId> @@ -50,13 +50,19 @@ <dependency> <groupId>org.scalatest</groupId> <artifactId>scalatest_${scala.tools.version}</artifactId> - <version>2.2.4</version> + <version>3.2.15</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatestplus</groupId> + <artifactId>junit-4-13_${scala.tools.version}</artifactId> + <version>3.2.15.0</version> <scope>test</scope> </dependency> <dependency> <groupId>junit</groupId> 
<artifactId>junit</artifactId> - <version>4.12</version> + <version>4.13</version> <scope>test</scope> </dependency> <dependency> diff --git a/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala b/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala index 73ebb4e..dc5d896 100644 --- a/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala +++ b/src/main/scala/org/apache/spark/ml/feature/MDLPDiscretizer.scala @@ -260,7 +260,7 @@ object DiscretizerModel extends MLReadable[DiscretizerModel] { .select("splits") .head() val model = new DiscretizerModel(metadata.uid, splits) - DefaultParamsReader.getAndSetParams(model, metadata) + metadata.getAndSetParams(model) model } } diff --git a/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala b/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala index 7248dc7..85fa4a5 100644 --- a/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/DiscretizationUtilsSuite.scala @@ -2,8 +2,9 @@ package org.apache.spark.ml.feature import org.apache.spark.mllib.feature.DiscretizationUtils._ import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner /** @@ -12,7 +13,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class FeatureUtilsSuite extends FunSuite { +class FeatureUtilsSuite extends AnyFunSuite { test("Test entropy calc (typical 1)") { diff --git a/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala index 1b77862..54581b9 100644 --- a/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala +++ 
b/src/test/scala/org/apache/spark/ml/feature/FewValuesThresholdFinderSuite.scala @@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._ import org.apache.spark.mllib.feature.FewValuesThresholdFinder import org.apache.spark.sql.{Row, SQLContext} import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner /** @@ -15,7 +16,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class FewValuesThresholdFinderSuite extends FunSuite with BeforeAndAfterAll { +class FewValuesThresholdFinderSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = _ diff --git a/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala index c2f0be7..bf269b7 100644 --- a/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/InitialThresholdsFinderSuite.scala @@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._ import org.apache.spark.mllib.feature.{FewValuesThresholdFinder, InitialThresholdsFinder} import org.apache.spark.sql.SQLContext import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatestplus.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite /** @@ -15,7 +16,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class InitialThresholdsFinderSuite extends FunSuite with BeforeAndAfterAll { +class InitialThresholdsFinderSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = _ val finder = new 
InitialThresholdsFinder() diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala index d1491cd..edafb01 100644 --- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerBigSuite.scala @@ -2,8 +2,9 @@ package org.apache.spark.ml.feature import org.apache.spark.sql.{DataFrame, SQLContext} import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatestplus.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.ml.feature.TestHelper._ @@ -13,7 +14,7 @@ import org.apache.spark.ml.feature.TestHelper._ * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class MDLPDiscretizerBigSuite extends FunSuite with BeforeAndAfterAll { +class MDLPDiscretizerBigSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = null diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala index 1da4293..5cd378f 100644 --- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerHugeSuite.scala @@ -3,8 +3,9 @@ package org.apache.spark.ml.feature import org.apache.spark.ml.feature.TestHelper._ import org.apache.spark.sql.{DataFrame, SQLContext} import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatestplus.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite /** @@ -13,7 +14,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class 
MDLPDiscretizerHugeSuite extends FunSuite with BeforeAndAfterAll { +class MDLPDiscretizerHugeSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = null diff --git a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala index 12fb62c..b55d72e 100644 --- a/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/MDLPDiscretizerSuite.scala @@ -2,8 +2,9 @@ package org.apache.spark.ml.feature import org.apache.spark.sql.{DataFrame, SQLContext} import org.junit.runner.RunWith -import org.scalatest.{BeforeAndAfterAll, FunSuite} -import org.scalatest.junit.JUnitRunner +import org.scalatestplus.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite import TestHelper._ @@ -13,7 +14,7 @@ import TestHelper._ * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class MDLPDiscretizerSuite extends FunSuite with BeforeAndAfterAll { +class MDLPDiscretizerSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = _ diff --git a/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala index aa437ec..ac3fdf1 100644 --- a/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/ManyValuesThresholdFinderSuite.scala @@ -4,8 +4,9 @@ import org.apache.spark.ml.feature.TestHelper._ import org.apache.spark.mllib.feature.ManyValuesThresholdFinder import org.apache.spark.sql.SQLContext import org.junit.runner.RunWith -import org.scalatest.junit.JUnitRunner -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.scalatestplus.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite /** @@ -15,7 +16,7 @@ import 
org.scalatest.{BeforeAndAfterAll, FunSuite} * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class ManyValuesThresholdFinderSuite extends FunSuite with BeforeAndAfterAll { +class ManyValuesThresholdFinderSuite extends AnyFunSuite with BeforeAndAfterAll { var sqlContext: SQLContext = _ diff --git a/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala b/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala index fd82aa3..70a2831 100644 --- a/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala +++ b/src/test/scala/org/apache/spark/ml/feature/TestHelper.scala @@ -36,6 +36,7 @@ object TestHelper { approximate: Boolean = false): DiscretizerModel = { val featureAssembler = new VectorAssembler() .setInputCols(inputCols) + .setHandleInvalid("keep") .setOutputCol("features") val processedDf = featureAssembler.transform(dataframe) diff --git a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala index 5f693f4..4974dd3 100644 --- a/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala +++ b/src/test/scala/org/apache/spark/ml/feature/ThresholdFinderSuite.scala @@ -3,8 +3,9 @@ package org.apache.spark.ml.feature import org.apache.spark.mllib.feature.{BucketInfo, ThresholdFinder} import org.apache.spark.sql.SQLContext import org.junit.runner.RunWith -import org.scalatest.{BeforeAndAfterAll, FunSuite} -import org.scalatest.junit.JUnitRunner +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.scalatestplus.junit.JUnitRunner /** @@ -13,7 +14,7 @@ import org.scalatest.junit.JUnitRunner * @author Barry Becker */ @RunWith(classOf[JUnitRunner]) -class ThresholdFinderSuite extends FunSuite { +class ThresholdFinderSuite extends AnyFunSuite { test("Test calcCriterion with even split hence low criterion value (and high entropy)") { -- GitLab