Commit f1a84ee: first commit

feiga committed Sep 12, 2015 (1 parent: 7725e55)
Showing 31 changed files with 3,636 additions and 2 deletions.
23 changes: 23 additions & 0 deletions LICENSE
@@ -0,0 +1,23 @@
The MIT License (MIT)

Copyright (c) Microsoft Corporation

All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
54 changes: 54 additions & 0 deletions Makefile
@@ -0,0 +1,54 @@
PROJECT := $(shell readlink $(dir $(lastword $(MAKEFILE_LIST))) -f)

CXX = g++
CXXFLAGS = -O3 \
-std=c++11 \
-Wall \
-Wno-sign-compare \
-fno-omit-frame-pointer

MULTIVERSO_DIR = $(PROJECT)/multiverso
MULTIVERSO_INC = $(MULTIVERSO_DIR)/include
MULTIVERSO_LIB = $(MULTIVERSO_DIR)/lib
THIRD_PARTY_LIB = $(MULTIVERSO_DIR)/third_party/lib

INC_FLAGS = -I$(MULTIVERSO_INC)
LD_FLAGS = -L$(MULTIVERSO_LIB) -lmultiverso
LD_FLAGS += -L$(THIRD_PARTY_LIB) -lzmq -lmpi -lmpl

LIGHTLDA_HEADERS = $(shell find $(PROJECT)/src -type f -name "*.h")
LIGHTLDA_SRC = $(shell find $(PROJECT)/src -type f -name "*.cpp")
LIGHTLDA_OBJ = $(LIGHTLDA_SRC:.cpp=.o)

DUMP_BINARY_SRC = $(shell find $(PROJECT)/preprocess -type f -name "*.cpp")

BIN_DIR = $(PROJECT)/bin
LIGHTLDA = $(BIN_DIR)/lightlda
DUMP_BINARY = $(BIN_DIR)/dump_binary

all: path \
lightlda \
dump_binary

path: $(BIN_DIR)

$(BIN_DIR):
	mkdir -p $@

$(LIGHTLDA): $(LIGHTLDA_OBJ)
	$(CXX) $(LIGHTLDA_OBJ) $(CXXFLAGS) $(INC_FLAGS) $(LD_FLAGS) -o $@

$(LIGHTLDA_OBJ): %.o: %.cpp $(LIGHTLDA_HEADERS) $(MULTIVERSO_INC)
	$(CXX) $(CXXFLAGS) $(INC_FLAGS) -c $< -o $@

$(DUMP_BINARY): $(DUMP_BINARY_SRC)
	$(CXX) $(CXXFLAGS) $< -o $@

lightlda: path $(LIGHTLDA)

dump_binary: path $(DUMP_BINARY)

clean:
	rm -rf $(BIN_DIR) $(LIGHTLDA_OBJ)

.PHONY: all path lightlda dump_binary clean
35 changes: 33 additions & 2 deletions README.md
100644 → 100755
@@ -1,2 +1,33 @@
# lightlda
Scalable, fast, and lightweight system for large-scale topic modeling
# LightLDA

LightLDA is a distributed system for large-scale topic modeling. It implements a distributed sampler that enables very large data and model sizes. LightLDA improves sampling throughput and convergence speed via a fast O(1) Metropolis-Hastings algorithm, and allows a small cluster to tackle very large data and model sizes through model scheduling and a data-parallel architecture. LightLDA is implemented in C++ for performance.

We have successfully trained big topic models (with trillions of parameters) on big data (the top 10% of Bing-indexed web pages by PageRank, comprising billions of documents) at Microsoft. For more technical details, please refer to our [WWW'15 paper](http://www.www2015.it/documents/proceedings/proceedings/p1351.pdf).
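
At the heart of the fast sampler is O(1) sampling from discrete distributions via alias tables, whose construction cost is amortized over many Metropolis-Hastings proposals (stale proposals are corrected by the acceptance test). The sketch below is a minimal, self-contained alias table in C++; it illustrates the building block only and is not LightLDA's actual implementation:

```
#include <queue>
#include <random>
#include <vector>

// Build-once, sample-many alias table (Vose's method). Construction is O(K)
// for K outcomes; each draw is O(1): one uniform bucket pick plus one coin flip.
class AliasTable {
 public:
  explicit AliasTable(const std::vector<double>& weights)
      : prob_(weights.size()), alias_(weights.size()),
        rng_(std::random_device{}()) {
    const size_t n = weights.size();
    double sum = 0.0;
    for (double w : weights) sum += w;
    std::vector<double> scaled(n);
    std::queue<size_t> small, large;
    for (size_t i = 0; i < n; ++i) {
      scaled[i] = weights[i] * n / sum;
      (scaled[i] < 1.0 ? small : large).push(i);
    }
    while (!small.empty() && !large.empty()) {
      size_t s = small.front(); small.pop();
      size_t l = large.front(); large.pop();
      prob_[s] = scaled[s];
      alias_[s] = l;
      scaled[l] -= 1.0 - scaled[s];  // move l's excess mass into s's bucket
      (scaled[l] < 1.0 ? small : large).push(l);
    }
    while (!large.empty()) { prob_[large.front()] = 1.0; large.pop(); }
    while (!small.empty()) { prob_[small.front()] = 1.0; small.pop(); }
  }

  size_t Sample() {
    std::uniform_int_distribution<size_t> bucket(0, prob_.size() - 1);
    std::uniform_real_distribution<double> coin(0.0, 1.0);
    size_t i = bucket(rng_);
    return coin(rng_) < prob_[i] ? i : alias_[i];
  }

 private:
  std::vector<double> prob_;   // probability of keeping bucket i
  std::vector<size_t> alias_;  // alternative outcome for bucket i
  std::mt19937 rng_;
};
```

Reusing one table across many proposals is what brings the per-token proposal cost down to O(1).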

## Why LightLDA

The highlight features of LightLDA are:

* **Scalable**: LightLDA can train models with trillions of parameters on big data with billions of documents, a scale that previous implementations cannot handle.
* **Fast**: The sampler can process millions of tokens per second on each multi-core node.
* **Lightweight**: Such big training tasks need as few as tens of machines.

## Quick Start

Run ```$ ./build.sh``` to build LightLDA.


## Reference

Please cite LightLDA if it helps in your research:

```
@inproceedings{yuan2015lightlda,
title={LightLDA: Big Topic Models on Modest Computer Clusters},
author={Yuan, Jinhui and Gao, Fei and Ho, Qirong and Dai, Wei and Wei, Jinliang and Zheng, Xun and Xing, Eric Po and Liu, Tie-Yan and Ma, Wei-Ying},
booktitle={Proceedings of the 24th International Conference on World Wide Web},
pages={1351--1361},
year={2015},
organization={International World Wide Web Conferences Steering Committee}
}
```
12 changes: 12 additions & 0 deletions build.sh
@@ -0,0 +1,12 @@
#!/bin/bash
# build lightlda and its multiverso dependency

git clone https://github.com/msraai/multiverso

cd multiverso
cd third_party
sh install.sh
cd ..
make -j4 all

cd ..
make -j4
23 changes: 23 additions & 0 deletions example/nytimes.sh
@@ -0,0 +1,23 @@
#!/bin/bash

root=`pwd`
echo $root
bin=$root/../bin
dir=$root/data/nytimes

mkdir -p $dir
cd $dir

# 1. Download the data
wget https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz
gunzip $dir/docword.nytimes.txt.gz
wget https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.nytimes.txt

# 2. UCI format to libsvm format
python $root/text2libsvm.py $dir/docword.nytimes.txt $dir/vocab.nytimes.txt $dir/nytimes.libsvm $dir/nytimes.word_id.dict

# 3. libsvm format to binary format
$bin/dump_binary $dir/nytimes.libsvm $dir/nytimes.word_id.dict $dir 0

# 4. Run LightLDA
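# Flag notes (a reading of the flag names, not authoritative docs):
# -alpha and -beta are the Dirichlet priors on the document-topic and
# topic-word distributions, and -mh_steps is the number of Metropolis-
# Hastings steps per token; the size flags (-num_vocabs, -max_num_document,
# -data_capacity) appear to bound buffer allocations for this dataset.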
$bin/lightlda -num_vocabs 111400 -num_topics 1000 -num_iterations 100 -alpha 0.1 -beta 0.01 -mh_steps 4 -num_local_workers 1 -num_blocks 1 -max_num_document 300000 -input_dir $dir -data_capacity 800
23 changes: 23 additions & 0 deletions example/pubmed.sh
@@ -0,0 +1,23 @@
#!/bin/bash

root=`pwd`
echo $root
bin=$root/../bin
dir=$root/data/pubmed

mkdir -p $dir
cd $dir

# 1. Download the data
wget https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.pubmed.txt.gz
gunzip $dir/docword.pubmed.txt.gz
wget https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.pubmed.txt

# 2. UCI format to libsvm format
python $root/text2libsvm.py $dir/docword.pubmed.txt $dir/vocab.pubmed.txt $dir/pubmed.libsvm $dir/pubmed.word_id.dict

# 3. libsvm format to binary format
$bin/dump_binary $dir/pubmed.libsvm $dir/pubmed.word_id.dict $dir 0

# 4. Run LightLDA
$bin/lightlda -num_vocabs 144400 -num_topics 1000 -num_iterations 100 -alpha 0.1 -beta 0.01 -mh_steps 4 -num_local_workers 1 -num_blocks 1 -max_num_document 8300000 -input_dir $dir -data_capacity 6200
58 changes: 58 additions & 0 deletions example/text2libsvm.py
@@ -0,0 +1,58 @@
"""
This script is for converting UCI format docword and vocab file to libsvm format data and dict
(How to run)
python text2libsvm.py <docword.input> <vocab.input> <libsvm.output> <dict.output>
"""

import sys

if len(sys.argv) != 5:
    print("Usage: python text2libsvm.py <docword.input> <vocab.input> <libsvm.output> <dict.output>")
    sys.exit(1)

data_file = open(sys.argv[1], 'r')
vocab_file = open(sys.argv[2], 'r')

libsvm_file = open(sys.argv[3], 'w')
dict_file = open(sys.argv[4], 'w')

word_dict = {}   # word_id -> total count across the corpus
vocab_dict = []  # word_id -> word string
doc = ""
last_doc_id = 0

# Load the vocabulary: UCI word ids are 1-based line numbers in the vocab file.
for line in vocab_file:
    vocab_dict.append(line.strip())

# Stream the docword file: each body line is "doc_id word_id count";
# header lines (fewer than 3 columns) are skipped by the length check.
for line in data_file:
    col = line.strip().split(' ')
    if len(col) == 3:
        doc_id = int(col[0])
        word_id = int(col[1]) - 1  # shift word ids to 0-based
        word_count = int(col[2])
        if word_id not in word_dict:
            word_dict[word_id] = 0
        word_dict[word_id] += word_count
        if doc_id != last_doc_id:
            if doc != "":
                libsvm_file.write(doc.strip() + '\n')
            doc = str(doc_id) + '\t'
        doc += str(word_id) + ':' + str(word_count) + ' '
        last_doc_id = doc_id

# Flush the last document.
if doc != "":
    libsvm_file.write(doc.strip() + '\n')

libsvm_file.close()

# Dict file format: word_id <TAB> word <TAB> total count.
for word in word_dict:
    line = '\t'.join([str(word), vocab_dict[word], str(word_dict[word])]) + '\n'
    dict_file.write(line)

dict_file.close()
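
# Illustration of the conversion (hypothetical ids/counts): a docword body
# line "42 7 3" becomes the fragment "6:3" on document 42's libsvm line
# (ids shift to 0-based), and if vocab line 7 is "model", the dict file
# gets the line "6<TAB>model<TAB><total count of word 6>".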
