-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathall-maximal-sets-satelite.cc
158 lines (141 loc) · 4.47 KB
/
all-maximal-sets-satelite.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ---
// An algorithm for finding all maximal sets based on the approach
// used by SateLite (minus the bloom filters).
// ---
// Author: Roberto Bayardo
#include "all-maximal-sets-satelite.h"
#include <assert.h>
#include <algorithm>
#include <iostream>
#include <vector>
#include "set-properties.h"
#include "data-source-iterator.h"
namespace google_extremal_sets {
namespace {
// Returns true if set2 properly subsumes set1.
inline bool IsSubsumedBy(const SetProperties& set1, const SetProperties& set2) {
if (set1.size >= set2.size)
return false;
const uint32_t* it1 = set1.begin();
const uint32_t* it2 = set2.begin();
const uint32_t* it1_end = set1.end();
const uint32_t* it2_end = set2.end();
while (it1 != it1_end) {
it2 = std::lower_bound(it2, it2_end, *it1);
if (it2 == it2_end || *it2 > *it1)
return false;
++it1;
++it2;
}
return true;
}
// Used to free up all itemset resources when it goes out of scope.
class CleanerUpper {
public:
CleanerUpper(OccursList* all_sets) :
all_sets_(all_sets) {}
~CleanerUpper() {
for (unsigned int i = 0; i < all_sets_->size(); ++i) {
SetProperties::Delete((*all_sets_)[i]);
}
all_sets_->clear();
}
private:
OccursList* all_sets_;
};
} // namespace
bool AllMaximalSetsSateLite::FindAllMaximalSets(
DataSourceIterator* data,
uint32_t max_item_id,
uint32_t max_items_in_ram,
OutputModeEnum output_mode) {
Init();
// Vars set by the data source iterator.
int result;
uint32_t set_id;
std::vector<uint32_t> current_set;
if (!PrepareForDataScan(data, max_item_id, 0))
return false; // IO error
uint32_t items_in_ram = 0;
CleanerUpper cleanup(&all_sets_);
// This loop scans the input data from beginning to end and indexes
// each candidate on the occurrs_ lists.
while ((result = data->Next(&set_id, ¤t_set)) > 0) {
SetProperties* index_me = SetProperties::Create(set_id, current_set);
items_in_ram += current_set.size();
all_sets_.push_back(index_me);
if (items_in_ram >= max_items_in_ram) {
std::cerr << "; ERROR: max_items_in_ram exceeded." << std::endl;
return false;
}
++input_sets_count_;
for (unsigned int i = 0; i < index_me->size; ++i) {
occurs_[index_me->item[i]].push_back(index_me);
}
}
if (result != 0)
return false; // IO error
std::cerr << "; Starting subsumption checking scan." << std::endl;
for (unsigned int i = 0; i < all_sets_.size(); ++i) {
if (!IsSubsumed(*all_sets_[i]))
FoundMaximalSet(*(all_sets_[i]), output_mode);
}
return true;
}
void AllMaximalSetsSateLite::Init() {
maximal_sets_count_ = input_sets_count_ = subsumption_checks_count_ = 0;
}
bool AllMaximalSetsSateLite::PrepareForDataScan(
DataSourceIterator* data, uint32_t max_item_id, off_t resume_offset) {
occurs_.clear();
occurs_.resize(max_item_id);
std::cerr << "; Starting new dataset scan at offset: "
<< resume_offset << std::endl;
return data->Seek(resume_offset);
}
bool AllMaximalSetsSateLite::IsSubsumed(const SetProperties& candidate) {
const OccursList& occurs = occurs_[candidate[0]];
for (unsigned int j = 0; j < occurs.size(); ++j) {
SetProperties* check_me = occurs[j];
++subsumption_checks_count_;
if (IsSubsumedBy(candidate, *check_me))
return true;
}
return false;
}
void AllMaximalSetsSateLite::FoundMaximalSet(
const SetProperties& maximal_set, OutputModeEnum output_mode) {
++maximal_sets_count_;
switch (output_mode) {
case COUNT_ONLY:
break;
case ID:
std::cout << maximal_set.set_id << '\n';
break;
case ID_AND_ITEMS:
std::cout << maximal_set.set_id << ":";
for (unsigned int i = 0; i < maximal_set.size; ++i) {
std::cout << ' ' << maximal_set.item[i];
}
std::cout << '\n';
break;
default:
std::cout << "Huh?\n";
assert(0);
break;
}
}
} // namespace google_extremal_sets