1
1
import os
2
+ import hashlib
3
+
4
def calculate_file_hash(file_path, chunk_size=4096):
    """Return the SHA-256 hex digest of the file at *file_path*.

    The file is read in binary chunks so arbitrarily large files can be
    hashed without loading them fully into memory.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Bytes to read per iteration (default 4096; kept as a
            parameter so callers can tune it, backward-compatible).

    Returns:
        The SHA-256 digest as a lowercase hexadecimal string.
    """
    hash_sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a b"" sentinel stops cleanly at EOF (empty read).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
2
11
3
12
def count_files_in_categories(base_dir):
    """Count the regular files in each category subdirectory of *base_dir*.

    Every regular file found is also hashed (SHA-256 via
    calculate_file_hash) so duplicates across all categories can be
    detected by the caller.

    Args:
        base_dir: Directory whose immediate subdirectories are the
            categories to scan.

    Returns:
        A tuple ``(category_counts, unique_hashes)`` where
        ``category_counts`` maps each category name to its file count and
        ``unique_hashes`` is the set of distinct file hashes seen.
    """
    category_counts = {}
    unique_hashes = set()

    for entry in os.listdir(base_dir):
        category_dir = os.path.join(base_dir, entry)
        # Only directories are categories; skip stray top-level files.
        if not os.path.isdir(category_dir):
            continue

        count = 0
        for name in os.listdir(category_dir):
            path = os.path.join(category_dir, name)
            if not os.path.isfile(path):
                continue
            count += 1
            # Collect the content hash so duplicates can be counted once.
            unique_hashes.add(calculate_file_hash(path))

        category_counts[entry] = count

    return category_counts, unique_hashes
15
34
16
35
# 输出目录路径
17
36
output_path = "poc"
18
37
19
38
# 调用统计函数
20
- category_counts = count_files_in_categories (output_path )
39
+ category_counts , unique_hashes = count_files_in_categories (output_path )
21
40
22
41
# 打印统计结果
23
42
print ("各类文件数量:" )
@@ -27,3 +46,7 @@ def count_files_in_categories(base_dir):
27
46
total_files += count
28
47
29
48
print (f"总文件数量: { total_files } " )
49
+
50
+ # 打印去重后文件数量
51
+ unique_file_count = len (unique_hashes )
52
+ print (f"去重后文件数量: { unique_file_count } " )
0 commit comments