Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalized similarity calculation for minhasher #654

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions algebird-core/src/main/scala/com/twitter/algebird/MinHasher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,44 @@ object MinHasher {
(hashes, bands)
}

// Providing an extension method on MinHasher[H] to preserve binary compatibility
implicit class similarityMultiExtender[H](val h: MinHasher[H]) extends AnyVal {
  /**
   * Generalized Jaccard similarity estimation for multiple sets
   * (size of the intersection of all sets / size of the union of all sets).
   *
   * Jsim(S1..Sn) ~= P(hmin1 == hmin2 == ... == hminn). The probability is estimated as
   * (number of hash functions on which every signature agrees) / numHashes.
   *
   * @param sigs the signatures to compare; presumably all produced by this hasher
   * @return the estimated similarity of the sets behind `sigs`
   * @throws IllegalArgumentException if the wrapped hasher is neither a MinHasher32 nor a MinHasher16
   */
  def similarityMulti(sigs: MinHashSignature*): Double =
    h match {
      case h32: MinHasher32 => MinHasher32.similarityMulti(h32, sigs: _*)
      case h16: MinHasher16 => MinHasher16.similarityMulti(h16, sigs: _*)
      // Only the two implementations above are supported; fail with a readable message
      // instead of an anonymous MatchError for any other MinHasher.
      case other =>
        throw new IllegalArgumentException(
          s"similarityMulti is only implemented for MinHasher32 and MinHasher16, got: $other")
    }
}
}

/**
 * Multi-signature similarity support for MinHasher32, whose signatures are read as Ints.
 */
object MinHasher32 {
  /**
   * Decodes every input byte array as a sequence of Ints and asks `h.buildArray` to build a new
   * array whose values are `fn` applied to the next Int taken from each input.
   *
   * @param h             hasher used to build the resulting array
   * @param hasherbuffers raw signature bytes, one array per signature
   * @param fn            combines the Ints found at the same position of every input
   */
  private def buildArrayMulti(h: MinHasher32, hasherbuffers: Seq[Array[Byte]])(fn: Seq[Int] => Int): Array[Byte] = {
    // Materialize the buffers exactly once. Each IntBuffer carries its own read position, so a
    // lazy, non-memoizing Seq (e.g. a view) must not re-create them on every traversal.
    // NOTE(review): assumes buildArray takes its argument by name and evaluates it once per hash.
    val intBuffers = hasherbuffers.map(b => ByteBuffer.wrap(b).asIntBuffer).toVector
    h.buildArray(fn(intBuffers.map(_.get)))
  }

  /**
   * Estimates the generalized Jaccard similarity of `sigs`: the share of hash positions on which
   * all signatures hold the same value.
   *
   * @param h    hasher that produced the signatures
   * @param sigs signatures to compare
   * @param n    supplies the one/zero written for an agreeing/disagreeing position
   * @return sum of the built array's bytes divided by `h.numHashes`
   */
  def similarityMulti(h: MinHasher32, sigs: MinHashSignature*)(implicit n: Numeric[Int]): Double = {
    val agreement = buildArrayMulti(h, sigs.map(_.bytes)) { vals =>
      if (vals.forall(_ == vals.head)) n.one else n.zero
    }
    // Every encoded value is one or zero, so summing the raw bytes presumably counts the
    // positions where all signatures agree.
    agreement.map(_.toDouble).sum / h.numHashes
  }
}

/**
 * Multi-signature similarity support for MinHasher16, whose signatures are read as Chars.
 */
object MinHasher16 {
  /**
   * Decodes every input byte array as a sequence of Chars and asks `h.buildArray` to build a new
   * array whose values are `fn` applied to the next Char taken from each input.
   *
   * @param h             hasher used to build the resulting array
   * @param hasherbuffers raw signature bytes, one array per signature
   * @param fn            combines the Chars found at the same position of every input
   */
  private def buildArrayMulti(h: MinHasher16, hasherbuffers: Seq[Array[Byte]])(fn: Seq[Char] => Char): Array[Byte] = {
    // Materialize the buffers exactly once. Each CharBuffer carries its own read position, so a
    // lazy, non-memoizing Seq (e.g. a view) must not re-create them on every traversal.
    // NOTE(review): assumes buildArray takes its argument by name and evaluates it once per hash.
    val charBuffers = hasherbuffers.map(b => ByteBuffer.wrap(b).asCharBuffer).toVector
    h.buildArray(fn(charBuffers.map(_.get)))
  }

  /**
   * Estimates the generalized Jaccard similarity of `sigs`: the share of hash positions on which
   * all signatures hold the same value.
   *
   * @param h    hasher that produced the signatures
   * @param sigs signatures to compare
   * @param n    supplies the one/zero written for an agreeing/disagreeing position
   * @return sum of the built array's bytes divided by `h.numHashes`
   */
  def similarityMulti(h: MinHasher16, sigs: MinHashSignature*)(implicit n: Numeric[Char]): Double = {
    val agreement = buildArrayMulti(h, sigs.map(_.bytes)) { vals =>
      if (vals.forall(_ == vals.head)) n.one else n.zero
    }
    // Every encoded value is one or zero, so summing the raw bytes presumably counts the
    // positions where all signatures agree.
    agreement.map(_.toDouble).sum / h.numHashes
  }
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ class MinHasherSpec extends WordSpec with Matchers {
assert(error < epsilon)
}

/**
 * Generates `sets` random sets with roughly the requested similarity and checks that the
 * estimate produced by `mh` is within `epsilon` of the exactly computed similarity.
 *
 * @param mh         hasher under test
 * @param sets       number of sets to generate
 * @param samples    sample budget passed to `randomMultiSets`
 * @param similarity target share of elements common to all sets
 * @param epsilon    maximum tolerated absolute error
 */
def testMulti[H](mh: MinHasher[H], sets: Int, samples: Int, similarity: Double, epsilon: Double) = {
  val randomSets = randomMultiSets(sets, samples, similarity)

  val exact = exactSimilarityMulti(randomSets)
  val sim = approxSimilarityMulti(mh, randomSets)
  val error: Double = math.abs(exact - sim)
  // Report before asserting so the measured numbers are also visible when the check fails.
  info(s"sets: $sets, exact: $exact, sim: $sim, error: $error, epsilon: $epsilon")
  assert(error < epsilon)
}

def randomSets(similarity: Double) = {
val s = 10000
val uniqueFraction = if (similarity == 1.0) 0.0 else (1 - similarity) / (1 + similarity)
Expand All @@ -42,16 +52,39 @@ class MinHasherSpec extends WordSpec with Matchers {
(unique1 ++ shared, unique2 ++ shared)
}

/**
 * Builds `sets` random sets of Doubles. Every set contains the same pool of
 * `(similarity * samples).toInt` shared values plus its own
 * `((samples - similarity * samples) / sets).toInt` freshly drawn values.
 */
def randomMultiSets(sets: Int, samples: Int, similarity: Double) = {
  val sharedSamples = similarity * samples
  val uniqueSamples = samples - sharedSamples
  val uniquePerSet = (uniqueSamples / sets).toInt

  // Pool that is part of every generated set.
  val common = Vector.fill(sharedSamples.toInt)(math.random).toSet

  (1 to sets).map { _ =>
    val own = Vector.fill(uniquePerSet)(math.random).toSet
    own ++ common
  }
}

/** Exact Jaccard similarity of two sets: size of the intersection over size of the union. */
def exactSimilarity[T](x: Set[T], y: Set[T]) = {
  val commonCount = x.intersect(y).size
  val totalCount = x.union(y).size
  commonCount.toDouble / totalCount
}

/**
 * Exact generalized Jaccard similarity: number of elements present in every set divided by the
 * number of elements present in at least one set.
 */
def exactSimilarityMulti[T](sets: Seq[Set[T]]) = {
  val inAll = sets.reduce((left, right) => left.intersect(right))
  val inAny = sets.reduce((left, right) => left.union(right))
  inAll.size.toDouble / inAny.size.toDouble
}

/**
 * Estimates the similarity of `x` and `y` with `mh`: each set is turned into one signature by
 * hashing the string form of its elements and combining the results with `mh.plus`.
 */
def approxSimilarity[T, H](mh: MinHasher[H], x: Set[T], y: Set[T]) = {
  // One signature per set, built from the per-element signatures.
  def signatureOf(elements: Set[T]) =
    elements
      .map(element => mh.init(element.toString))
      .reduce((left, right) => mh.plus(left, right))

  val sig1 = signatureOf(x)
  val sig2 = signatureOf(y)
  mh.similarity(sig1, sig2)
}

/**
 * Estimates the similarity of all `sets` with `mh`: each set is turned into one signature by
 * hashing the string form of its elements and combining the results with `mh.plus`; the
 * signatures are then handed to `similarityMulti`.
 */
def approxSimilarityMulti[T, H](mh: MinHasher[H], sets: Seq[Set[T]]) = {
  val sigs = sets.map { elements =>
    elements
      .map(element => mh.init(element.toString))
      .reduce((left, right) => mh.plus(left, right))
  }
  mh.similarityMulti(sigs: _*)
}

"MinHasher32" should {
"measure 0.5 similarity in 1024 bytes with < 0.1 error" in {
test(new MinHasher32(0.5, 1024), 0.5, 0.1)
Expand All @@ -63,4 +96,30 @@ class MinHasherSpec extends WordSpec with Matchers {
test(new MinHasher32(1.0, 1024), 1.0, 0.01)
}
}

"MinHasher32 multiset similarity with sets = 2" should {
  // Same scenarios as the pairwise MinHasher32 tests, run through the multiset code path
  // with two sets of 10000 samples.
  def check(similarity: Double, epsilon: Double) =
    testMulti(new MinHasher32(similarity, 1024), sets = 2, samples = 10000, similarity = similarity, epsilon = epsilon)

  "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
    check(similarity = 0.5, epsilon = 0.1)
  }
  "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
    check(similarity = 0.8, epsilon = 0.1)
  }
  "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
    check(similarity = 1.0, epsilon = 0.01)
  }
}

"MinHasher32 multiset similarity with sets = 10" should {
  // Multiset-only scenarios: ten sets of 10000 samples each.
  def check(similarity: Double, epsilon: Double) =
    testMulti(new MinHasher32(similarity, 1024), sets = 10, samples = 10000, similarity = similarity, epsilon = epsilon)

  "measure 0.5 multiset similarity in 1024 bytes with < 0.1 error" in {
    check(similarity = 0.5, epsilon = 0.1)
  }
  "measure 0.8 multiset similarity in 1024 bytes with < 0.1 error" in {
    check(similarity = 0.8, epsilon = 0.1)
  }
  "measure 1.0 multiset similarity in 1024 bytes with < 0.01 error" in {
    check(similarity = 1.0, epsilon = 0.01)
  }
}
}