-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_tiff_xml.py
280 lines (219 loc) · 10.5 KB
/
generate_tiff_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# coding: utf-8
"""
基于已审核的细胞图像文件名生成对应大图的 xml, xml 包含信息为对应大图里的已审核细胞位置
细胞图像包含其来源大图名称,如
# 1-p0.0000_markedAs_ASCH_2017-10-27-16_12_50_x9659_y28027_w616_h331.jpg
# 1-p0.0000_markedAs_CC_2018-03-27-23_18_27_x5675_y23431_w230_h207_4x.jpg
# 1-p0.0000_markedAs_ACTINO_2018-06-20_18_37_06_x34602_y10123_w145_h172_2x.jpg
# 1-p0.0000_2017-11-24-13_16_54_x6626_y35845_w150_h79_4x.jpg
# 1-p0.0001_2017-10-09-17_12_28_x19230_y29594_w370_h910_.jpg
# m1_2018-04-04-17_50_08_x11194_y33583_w163_h112.jpg
# m1_2018-06-20_18_37_06_x10880_y42947_w113_h122.jpg
"""
import json
import os
from copy import deepcopy
from concurrent.futures import ProcessPoolExecutor, as_completed
from constants import METADATA_FILE_PATH, AGC_CLASSES, CHECKED_CELL_XML_SAVE_PATH, ACCEPTED_OVERLAPPED_RATIO, TIFF_IMAGE_RESOURCE_PATH
from utils import FilesScanner, generate_checked_level_xml, cal_IOU, get_location_from_filename, generate_name_path_dict
if not os.path.exists(METADATA_FILE_PATH):
os.makedirs(METADATA_FILE_PATH, exist_ok=True)
if not os.path.exists(CHECKED_CELL_XML_SAVE_PATH):
os.makedirs(CHECKED_CELL_XML_SAVE_PATH, exist_ok=True)
def get_cell_image(path, ctype, parent_pathes):
"""
获取细胞文件路径
:param path: 细胞图像路径
:param ctype: 标注类别 MANUAL or AUTO
:param parent_pathes: 大图名称及对应路径字典
:return:
"""
# 检查本地有无细胞图像文件路径信息文件
# 如果存在,则直接读取
# 如果没有,通过 FileScanner 工具类获取并写入本地文件
image_path_info_dict_path = ctype + '_IMAGES_PATH_DICT.txt'
check_name = os.path.join(METADATA_FILE_PATH, image_path_info_dict_path)
if os.path.exists(check_name):
with open(os.path.join(METADATA_FILE_PATH, image_path_info_dict_path)) as f:
files = [item.replace('\n', '') for item in f.readlines()]
else:
files = FilesScanner(path, ['.jpg']).get_files()
with open(os.path.join(METADATA_FILE_PATH, image_path_info_dict_path), 'w') as o:
o.writelines([item + '\n' for item in files])
# 根据细胞图像文件名生成细胞坐标信息
cells_dict = {}
for item in files:
if item.endswith('.jpg'):
# 细胞图文件名
basename = os.path.basename(item)
parent = os.path.dirname(item)
# 细胞所属类别
clas_type = os.path.basename(parent)
parent = os.path.dirname(parent)
# 细胞所属大图名称
parent_name = os.path.basename(parent)
parent = os.path.dirname(parent)
# 大图所属类别
parent_type = os.path.basename(parent)
# 大图原始路径
try:
parent_path = parent_pathes[parent_name]
except Exception as e:
print("CANNOT FIND RELATIVE TIFF PATH INFO, %s" % str(e))
exit()
# 解析坐标信息
point = get_location_from_filename(basename)
assert point, "THIS JPG NAME IS NOT ACCEPTED => %s" % basename
_, x, y, w, h, _ = point
# 修正 AGC 细胞类别
if clas_type in AGC_CLASSES:
clas_type = 'AGC'
# 解析与修正大图分类
if '_' in parent_type:
parent_type = parent_type.split('_')[-1]
if parent_type in AGC_CLASSES:
parent_type = 'AGC'
# 细胞位置及类别信息
info = {
'name': basename,
'cell_type': clas_type,
'cell_path': item,
'parent': parent_name,
'parent_full_name': os.path.basename(parent_path),
'parent_type': parent_type,
'x': x,
'y': y,
'w': w,
'h': h,
}
if parent_name in cells_dict:
cells_dict[parent_name].append(info)
else:
cells_dict[parent_name] = [info]
# 将解析细胞数据按归属大图名称写入文件
for key, lines in cells_dict.items():
# 生成输出路径
save_path = os.path.join(METADATA_FILE_PATH, ctype + '_IMAGES_PATH_DICT')
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, key + '.txt'), 'w') as f:
for line in lines:
f.write(json.dumps(line) + '\n')
return cells_dict
def generate_xml_file(points_collection):
# 统计当前数据生成进度
count = 0
# 待处理文件总数
size = len(points_collection)
# 遍历拷贝大图文件及生成xml文件
for key, value in points_collection.items():
parent_name = value[0]['parent']
parent_file_name = value[0]['parent_full_name']
print("\nProcessing %s / %s %s" % (count + 1, size, parent_file_name))
# TIFF 文件本地存储路径
save_path = os.path.join(TIFF_IMAGE_RESOURCE_PATH, parent_file_name)
# 当本地文件不存在时从远程服务器下载该文件
if not os.path.exists(save_path):
pass
# shutil.copy(parent_path, save_path)
else:
pass
# 生成大图对应 细胞标注点 xml 文件
xml_file_name = os.path.join(CHECKED_CELL_XML_SAVE_PATH, parent_name + '.xml')
print('GENERATE XML %s' % xml_file_name)
if len(value) > 0:
generate_checked_level_xml(xml_file_name, value)
count += 1
if __name__ == '__main__':
# 大图存储位置
tif_path = [
'/run/user/1000/gvfs/smb-share:server=192.168.2.221,share=data_samba/DATA/checked_cells/manual_labelled_checked/label_kfb_tif/label_data',
'/run/user/1000/gvfs/smb-share:server=192.168.2.221,share=data_samba/DATA/checked_cells/manual_labelled_checked/label_kfb_tif/stage2_labelimg',
'/run/user/1000/gvfs/smb-share:server=192.168.2.221,share=data_samba/DATA/checked_cells/manual_labelled_checked/label_kfb_tif/stage2_online',
]
# 自动标注细胞图像存储位置
auto_path = '/run/user/1000/gvfs/smb-share:server=192.168.2.221,share=data_samba/DATA/checked_cells/auto_cellCutting_checked/Drzhuchecked'
# 手工标注细胞图像存储位置
manual_path = '/run/user/1000/gvfs/smb-share:server=192.168.2.221,share=data_samba/DATA/checked_cells/manual_labelled_checked/renamed_manual_checked_cells'
# 1. 检查大图 名称与路径对应关系 txt 文件是否存在, 生成生成大图文件名与路径 dict
tif_images_collections_path = os.path.join(METADATA_FILE_PATH, 'TIFF_IMAGES_PATH_DICT.txt')
if not os.path.exists(tif_images_collections_path):
print('GENERATE TIFF IMAGES FILES PATH COLLECTIONS FILE...')
tif_images = generate_name_path_dict(tif_path, ['.tif', '.kfb'], tif_images_collections_path)
else:
print('LOAD TIFF IMAGES FILES PATH COLLECTIONS...')
tif_images = {}
with open(tif_images_collections_path) as f:
lines = f.readlines()
for line in lines:
name, path = line.replace('\n', '').split('\t')
if name in tif_images:
print('ERROR')
else:
tif_images[name] = path
if not tif_images:
print("NO TIFF FILES FOUND!")
exit()
else:
print("GOT %s TIFF FILES" % len(tif_images))
# 2. 生成大图与细胞位置信息文件路径字典
# 人工标注细胞信息
print('LOAD MANUAL IMAGES INFO COLLECTIONS...')
manual_images = get_cell_image(manual_path, 'MANUAL', tif_images)
# 自动标注细胞信息
print('LOAD AUTO IMAGES INFO COLLECTIONS...')
auto_images = get_cell_image(auto_path, 'AUTO', tif_images)
# 用于判断同一大图是否在手工标注和自动标注文件夹中同时存在
# rets[0]-不存在标注信息;rets[1]-存在 手工/自动 一种标注信息;rets[2]-存在 手工+自动 两种标注信息
rets = [0, 0, 0]
# 相似细胞个数统计
similar_count = 0
# 所有大图的文件名列表
tif_names = list(tif_images.keys())
# 3. 大图与图内细胞信息字典
print('DROP CELL WHICH IN AUTO AND MANUAL AND OVERLAP RATION OVER 0.5 ...')
points_collection = {}
for name in tif_names:
# 获取自动标注细胞信息文件路径
auto = auto_images.get(name, None)
# 获取手工标注细胞信息文件路径
manual = manual_images.get(name, None)
# 表示大图在手工标注和自动标注均不存在
n = 0
# 表示大图在自动标注中存在
n += 1 if auto else 0
# 表示大图在手工标注中存在
n += 1 if manual else 0
# 计数
rets[n] += 1
# 当两种标注数据同时存在时,进行去重处理
# 以手工标注信息为准,对自动标注信息在手工标注信息中进行遍历
# 若存在重合度 ratio > 0.5 的标注信息,直接丢弃;否则加入细胞信息列表
if n == 2:
# 以手工标注信息为基准
# TO-DO 过滤 重叠率 > 0.5 图像
available_points = deepcopy(manual)
# 对自动标注信息进行遍历,存在手工标注信息中存在 重合度 > 0.5 的细胞信息,直接丢弃
for item_ in auto:
x_, y_, w_, h_ = item_['x'], item_['y'], item_['w'], item_['h']
for item in manual:
x, y, w, h = item['x'], item['y'], item['w'], item['h']
ratio = cal_IOU((x, y, w, h), (x_, y_, w_, h_))
if ratio > ACCEPTED_OVERLAPPED_RATIO:
similar_count += 1
break
else:
available_points.append(item_)
points_collection[name] = available_points
elif n == 1:
if auto:
points_collection[name] = auto
if manual:
points_collection[name] = manual
print('\n大图文件总数:\t%s' % len(tif_names))
print('未匹配到标注信息的大图数量:\t%s' % rets[0])
print('匹配到一种标注信息的大图数量:\t%s' % rets[1])
print('匹配到两种标注信息的大图数量:\t%s' % rets[2])
print('重合度 大于0.5 的细胞数量:\t%s' % similar_count)
print('已标注大图数量:\t%s\n' % len(points_collection))
# 4. 拷贝文件,生成 xml 文件
generate_xml_file(points_collection)