twitter · ibenian · Feb 25, 2018 · Mar 3, 2018
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala b/algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
@@ -26,6 +26,44 @@ object MinHasher {
     (hashes, bands)
   }
 
+  // Providing an extension method on MinHasher[H] to preserve binary compatibility
+  implicit class similarityMultiExtender[H](val h: MinHasher[H]) extends AnyVal {
+    /** Generalized Jaccard similarity estimate for multiple sets (size of intersection / size of union).
+     * Jsim(S1..Sn) ~= (number of hash positions where hmin1 == hmin2 == ... == hminn) / numHashes
+     * Hashers other than MinHasher32 and MinHasher16 are rejected with an IllegalArgumentException.
+     */
+    def similarityMulti(sigs: MinHashSignature*): Double = h match {
+      case h32: MinHasher32 => MinHasher32.similarityMulti(h32, sigs: _*)
+      case h16: MinHasher16 => MinHasher16.similarityMulti(h16, sigs: _*)
+      case other => throw new IllegalArgumentException(s"similarityMulti is not supported for ${other.getClass.getName}")
+    }
+  }
+}
+
+object MinHasher32 {
+  // Views every signature as an IntBuffer; each evaluation of the buildArray argument reads the next Int of all of them and applies fn.
+  private def buildArrayMulti(h: MinHasher32, hasherbuffers: Seq[Array[Byte]])(fn: Seq[Int] => Int): Array[Byte] = {
+    val cursors = hasherbuffers.map(bytes => ByteBuffer.wrap(bytes).asIntBuffer)
+    h.buildArray { fn(cursors.map(_.get).toVector) }
+  }
+  // A slot scores n.one when all signatures agree on it and n.zero otherwise; the byte sum is then normalised by numHashes.
+  def similarityMulti(h: MinHasher32, sigs: MinHashSignature*)(implicit n: Numeric[Int]): Double = {
+    val agreement = buildArrayMulti(h, sigs.map(_.bytes)) { slot => if (slot.forall(_ == slot.head)) n.one else n.zero }
+    agreement.map(_.toDouble).sum / h.numHashes
+  }
+}
+
+object MinHasher16 {
+  // Views every signature as a CharBuffer; each evaluation of the buildArray argument reads the next Char of all of them and applies fn.
+  private def buildArrayMulti(h: MinHasher16, hasherbuffers: Seq[Array[Byte]])(fn: Seq[Char] => Char): Array[Byte] = {
+    val cursors = hasherbuffers.map(bytes => ByteBuffer.wrap(bytes).asCharBuffer)
+    h.buildArray { fn(cursors.map(_.get).toVector) }
+  }
+  // A slot scores n.one when all signatures agree on it and n.zero otherwise; the byte sum is then normalised by numHashes.
+  def similarityMulti(h: MinHasher16, sigs: MinHashSignature*)(implicit n: Numeric[Char]): Double = {
+    val agreement = buildArrayMulti(h, sigs.map(_.bytes)) { slot => if (slot.forall(_ == slot.head)) n.one else n.zero }
+    agreement.map(_.toDouble).sum / h.numHashes
+  }
 }
 
 /**

diff --git a/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/MinHasherTest.scala
@@ -31,6 +31,16 @@ class MinHasherSpec extends WordSpec with Matchers {
     assert(error < epsilon)
   }
 
+  def testMulti[H](mh: MinHasher[H], sets: Int, samples: Int, similarity: Double, epsilon: Double) = {
+    val randomSets = randomMultiSets(sets, samples, similarity)
+    val exact = exactSimilarityMulti(randomSets)
+    val sim = approxSimilarityMulti(mh, randomSets)
+    val error: Double = math.abs(exact - sim)
+    // Report before asserting so the measured values are visible even when the check below fails.
+    info(s"sets: $sets, exact: $exact, sim: $sim, error: $error, epsilon: $epsilon")
+    assert(error < epsilon)
+  }
+
   def randomSets(similarity: Double) = {
     val s = 10000
     val uniqueFraction = if (similarity == 1.0) 0.0 else (1 - similarity) / (1 + similarity)
@@ -42,16 +52,39 @@ class MinHasherSpec extends WordSpec with Matchers {
     (unique1 ++ shared, unique2 ++ shared)
   }
 
+  def randomMultiSets(sets: Int, samples: Int, similarity: Double) = {
+    // `similarity * samples` values are common to every set; the rest of the budget is split evenly into per-set uniques.
+    val commonCount = similarity * samples
+    val uniquePerSet = (samples - commonCount) / sets
+    def randomValues(count: Int) = Seq.fill(count)(math.random).toSet
+
+    val common = randomValues(commonCount.toInt)
+    // Each generated set is its own fresh unique values followed by the common pool.
+    (1 to sets).map { _ => randomValues(uniquePerSet.toInt) ++ common }
+  }
+
   def exactSimilarity[T](x: Set[T], y: Set[T]) = {
     (x & y).size.toDouble / (x ++ y).size
   }
 
+  // Exact generalized Jaccard index: size of the intersection of all sets over the size of their union.
+  def exactSimilarityMulti[T](sets: Seq[Set[T]]) =
+    sets.reduce(_ intersect _).size.toDouble / sets.reduce(_ union _).size
+
   def approxSimilarity[T, H](mh: MinHasher[H], x: Set[T], y: Set[T]) = {
     val sig1 = x.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) }
     val sig2 = y.map{ l => mh.init(l.toString) }.reduce{ (a, b) => mh.plus(a, b) }
     mh.similarity(sig1, sig2)
   }
 
+  def approxSimilarityMulti[T, H](mh: MinHasher[H], sets: Seq[Set[T]]) = {
+    // One signature per set: hash each element's string form with mh.init, then merge the results with mh.plus.
+    def signatureOf(elements: Set[T]) =
+      elements.map(e => mh.init(e.toString)).reduce(mh.plus(_, _))
+
+    mh.similarityMulti(sets.map(signatureOf): _*)
+  }
+
   "MinHasher32" should {
     "measure 0.5 similarity in 1024 bytes with < 0.1 error" in {
       test(new MinHasher32(0.5, 1024), 0.5, 0.1)
@@ -63,4 +96,30 @@ class MinHasherSpec extends WordSpec with Matchers {
       test(new MinHasher32(1.0, 1024), 1.0, 0.01)
     }
   }
+
+  "MinHasher32 multiset similarity with sets = 2" should {
+    // Re-runs the pairwise MinHasher32 similarity/epsilon cases above through testMulti, with sets = 2 and samples = 10000.
+    "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.5, 1024), sets = 2, samples = 10000, similarity = 0.5, epsilon = 0.1)
+    }
+    "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.8, 1024), sets = 2, samples = 10000, similarity = 0.8, epsilon = 0.1)
+    }
+    "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
+      testMulti(new MinHasher32(1.0, 1024), sets = 2, samples = 10000, similarity = 1.0, epsilon = 0.01)
+    }
+  }
+
+  "MinHasher32 multiset similarity with sets = 10" should {
+    // Same similarity/epsilon parameters as the sets = 2 cases, but comparing ten signatures in one similarityMulti call.
+    "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.5, 1024), sets = 10, samples = 10000, similarity = 0.5, epsilon = 0.1)
+    }
+    "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
+      testMulti(new MinHasher32(0.8, 1024), sets = 10, samples = 10000, similarity = 0.8, epsilon = 0.1)
+    }
+    "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
+      testMulti(new MinHasher32(1.0, 1024), sets = 10, samples = 10000, similarity = 1.0, epsilon = 0.01)
+    }
+  }
 }