Skip to content

Commit ff7ead5

Browse files
authored
Update 6-get_count.py
1 parent 0ca0092 commit ff7ead5

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

6-get_count.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,42 @@
11
import os
2+
import hashlib
3+
4+
def calculate_file_hash(file_path, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash in fixed-size
    chunks, so memory use does not grow with the file size.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be a
            positive integer. The digest does not depend on this value.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        ValueError: If *chunk_size* is not positive.
        OSError: If the file cannot be opened or read.
    """
    # read(0) returns b"" immediately, which would silently produce the
    # digest of an empty file; reject such sizes up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hash_sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
211

312
def count_files_in_categories(base_dir):
    """Count the regular files in each sub-directory of *base_dir*.

    Every immediate sub-directory of *base_dir* is treated as a category.
    Each regular file directly inside a category is counted and its
    content hash is collected so duplicates can be detected afterwards.

    Args:
        base_dir: Directory whose immediate sub-directories are the
            categories.

    Returns:
        A ``(category_counts, unique_hashes)`` tuple: a dict mapping each
        category name to its file count, and the set of distinct content
        hashes seen across all categories.
    """
    counts = {}
    seen_digests = set()

    # Walk the category directories found directly under the base directory.
    for name in os.listdir(base_dir):
        subdir = os.path.join(base_dir, name)
        if not os.path.isdir(subdir):
            continue

        tally = 0
        # Walk the entries of this category directory.
        for entry in os.listdir(subdir):
            candidate = os.path.join(subdir, entry)
            if not os.path.isfile(candidate):
                continue
            tally += 1
            # Record the content hash; the set de-duplicates identical files.
            seen_digests.add(calculate_file_hash(candidate))

        counts[name] = tally

    return counts, seen_digests
1534

1635
# 输出目录路径
1736
output_path = "poc"
1837

1938
# 调用统计函数
20-
category_counts = count_files_in_categories(output_path)
39+
category_counts, unique_hashes = count_files_in_categories(output_path)
2140

2241
# 打印统计结果
2342
print("各类文件数量:")
@@ -27,3 +46,7 @@ def count_files_in_categories(base_dir):
2746
total_files += count
2847

2948
print(f"总文件数量: {total_files}")
49+
50+
# 打印去重后文件数量
51+
unique_file_count = len(unique_hashes)
52+
print(f"去重后文件数量: {unique_file_count}")

0 commit comments

Comments
 (0)