forked from Irish-Film-Institute/IFIscripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
walk_to_dfxml.py
executable file
·205 lines (176 loc) · 6.99 KB
/
walk_to_dfxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
# This software was developed at the National Institute of Standards
# and Technology by employees of the Federal Government in the course
# of their official duties. Pursuant to title 17 Section 105 of the
# United States Code this software is not subject to copyright
# protection and is in the public domain. NIST assumes no
# responsibility whatsoever for its use by other parties, and makes
# no guarantees, expressed or implied, about its quality,
# reliability, or any other characteristic.
#
# We would appreciate acknowledgement if the software is used.
"""Walk current directory, writing DFXML to stdout."""
__version__ = "0.3.0"
import os
import stat
import hashlib
import argparse
import traceback
import logging
import sys
_logger = logging.getLogger(os.path.basename(__file__))
import Objects
def filepath_to_fileobject(filepath, args):
    """Build an Objects.FileObject describing the file system entry at filepath.

    The entry type is determined in steps: first with os.path probes (symbolic
    link, directory, regular file), then, if still undetermined, from the stat
    mode (character device, block device, FIFO, socket, whiteout).  Stat data
    comes from os.lstat for symbolic links and os.stat for everything else.
    Unless args.n is set, regular files are read in chunks and their MD5 and
    SHA-512 digests are stored on the object.

    Args:
        filepath: Path of the entry to describe; also stored as fobj.filename.
        args: Parsed command-line namespace.  Only args.n ("do not calculate
            any hashes") is read here.

    Returns:
        The populated FileObject.  If opening or reading a regular file fails,
        the failure is recorded in fobj.error and no digests are set; the
        exception is not re-raised.

    Raises:
        NotImplementedError: If none of the type checks recognises the entry.
        Exceptions from os.stat, os.lstat and os.readlink are not caught here.
    """
    fobj = Objects.FileObject()

    #Determine type - done in three steps.
    if os.path.islink(filepath):
        fobj.name_type = "l"
    elif os.path.isdir(filepath):
        fobj.name_type = "d"
    elif os.path.isfile(filepath):
        fobj.name_type = "r"
    #Otherwise the type determination is finished below with the stat structure.

    #Prime fileobjects from Stat data (lstat for soft links, so the link itself is described).
    if fobj.name_type == "l":
        sobj = os.lstat(filepath)
    else:
        sobj = os.stat(filepath)
    fobj.populate_from_stat(sobj)

    #NOTE(review): assumes a fresh FileObject reports name_type as None until it is assigned - confirm in Objects.py.
    if fobj.name_type is None:
        if stat.S_ISCHR(fobj.mode):
            fobj.name_type = "c"
        elif stat.S_ISBLK(fobj.mode):
            fobj.name_type = "b"
        elif stat.S_ISFIFO(fobj.mode):
            fobj.name_type = "p"
        elif stat.S_ISSOCK(fobj.mode):
            fobj.name_type = "s"
        elif stat.S_ISWHT(fobj.mode):
            fobj.name_type = "w"
        else:
            raise NotImplementedError("No reporting check written for file type of %r." % filepath)

    #Hard-coded information: Name, and assumed allocation status.
    fobj.filename = filepath
    fobj.alloc = True

    if fobj.name_type == "l":
        fobj.link_target = os.readlink(filepath)

    #Hashes are only computed for regular files, and only when not disabled with -n.
    if args.n or fobj.name_type != "r":
        return fobj

    try:
        with open(filepath, "rb") as in_fh:
            chunk_size = 2**22
            md5obj = hashlib.md5()
            sha512obj = hashlib.sha512()
            read_failed = False
            while True:
                try:
                    buf = in_fh.read(chunk_size)
                except Exception:
                    #format_exc() records the exception type, message and raise
                    #location; format_stack() would only describe this handler's
                    #own call stack and lose what actually went wrong.
                    read_failed = True
                    fobj.error = traceback.format_exc()
                    break
                if buf == b"":
                    break
                md5obj.update(buf)
                sha512obj.update(buf)

            #Digests of a partially read file would be misleading, so they are
            #only stored when every read succeeded.
            if not read_failed:
                fobj.md5 = md5obj.hexdigest()
                fobj.sha512 = sha512obj.hexdigest()
    except Exception:
        #Reached when the file cannot be opened (or closing it fails); append to
        #any read error already recorded above instead of overwriting it.
        if fobj.error is None:
            fobj.error = ""
        else:
            fobj.error += "\n"
        fobj.error += traceback.format_exc()
    return fobj
def main(args_):
    """Walk the current directory and return the resulting DFXML document.

    Every directory entry found by os.walk(".") (plus "." itself) is converted
    with filepath_to_fileobject and appended, in sorted path order, to an
    Objects.DFXMLObject.  When more than one job is requested and the
    threading and queue modules can be imported, the conversion is spread over
    args.jobs worker threads.

    Args:
        args_: Command-line arguments (without the program name), as accepted
            by parse_args.

    Returns:
        The value of DFXMLObject.to_dfxml() for the populated document.

    Raises:
        ValueError: Propagated from parse_args for a non-positive job count.
        Any exception raised by filepath_to_fileobject is propagated, in both
        single-threaded and threaded mode.
    """
    args = parse_args(args_)

    #Determine whether we're going in threading mode or not.  (Some modules are not available by default.)
    using_threading = False
    if args.jobs > 1:
        using_threading = True #(unless supporting modules are absent)
        try:
            import threading
        except ImportError:
            using_threading = False
            _logger.warning("Threading support not available. Running in single thread only.")

        try:
            import queue
        except ImportError:
            using_threading = False
            _logger.warning("Python queue support not available. (If running Ubuntu, this is in package python3-queuelib.) Running in single thread only.")

    dobj = Objects.DFXMLObject(version="1.1.1")
    dobj.program = sys.argv[0]
    dobj.program_version = __version__
    dobj.command_line = " ".join(sys.argv)
    dobj.dc["type"] = "File system walk"
    dobj.add_creator_library("Python", ".".join(map(str, sys.version_info[0:3]))) #A bit of a bend, but gets the major version information out.
    dobj.add_creator_library("Objects.py", Objects.__version__)
    dobj.add_creator_library("dfxml.py", Objects.dfxml.__version__)

    #Collect every path first; the set makes ordering during the walk irrelevant.
    filepaths = {"."}
    for (dirpath, dirnames, filenames) in os.walk("."):
        for dirent_name in dirnames + filenames:
            #The relpath wrapper removes "./" from the head of the path.
            filepaths.add(os.path.relpath(os.path.join(dirpath, dirent_name)))

    fileobjects_by_filepath = dict()

    if using_threading:
        #Threading syntax c/o: https://docs.python.org/3.5/library/queue.html
        q = queue.Queue()
        threads = []
        worker_errors = []

        def _worker():
            while True:
                filepath = q.get()
                if filepath is None:
                    break
                try:
                    fileobjects_by_filepath[filepath] = filepath_to_fileobject(filepath, args)
                except Exception as e:
                    #Keep the worker alive and hand the failure to the main thread.
                    worker_errors.append(e)
                finally:
                    #Always acknowledge the item; a missed task_done() would
                    #leave q.join() below blocked forever.
                    q.task_done()

        for _ in range(args.jobs):
            t = threading.Thread(target=_worker)
            t.start()
            threads.append(t)

        for filepath in filepaths:
            q.put(filepath)

        #Block until all tasks are done.
        q.join()

        #Stop workers: one sentinel per thread.
        for _ in range(args.jobs):
            q.put(None)
        for t in threads:
            t.join()

        #Surface the first worker failure, as the single-threaded path would.
        if worker_errors:
            raise worker_errors[0]
    else: #Not threading.
        for filepath in sorted(filepaths):
            fileobjects_by_filepath[filepath] = filepath_to_fileobject(filepath, args)

    #Build output DFXML tree.
    for filepath in sorted(fileobjects_by_filepath):
        dobj.append(fileobjects_by_filepath[filepath])
    return dobj.to_dfxml()
def parse_args(args_):
    """Parse the command-line arguments and configure logging.

    Args:
        args_: List of argument strings (without the program name).

    Returns:
        The argparse namespace with the attributes debug, n and jobs.

    Raises:
        ValueError: If the requested number of jobs is zero or negative.
    """
    option_specs = (
        (("-d", "--debug"), {"action": "store_true"}),
        (("-n",), {"action": "store_true", "help": "Do not calculate any hashes"}),
        (("-j", "--jobs"), {"type": int, "default": 1, "help": "Number of file-processing threads to run."}),
    )
    arg_parser = argparse.ArgumentParser()
    for flags, options in option_specs:
        arg_parser.add_argument(*flags, **options)
    parsed = arg_parser.parse_args(args_)

    #Logging verbosity follows the --debug switch.
    log_level = logging.INFO
    if parsed.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    if parsed.jobs < 1:
        raise ValueError("If requesting multiple jobs, please request 1 or more worker threads.")
    return parsed
if __name__ == "__main__":
    #main() only returns the serialised document.  Emit it here so that running
    #the script actually writes DFXML to stdout, as the module docstring states.
    _dfxml_output = main(sys.argv[1:])
    #NOTE(review): the return type of to_dfxml() is not visible in this file, so both bytes and text are handled - confirm against Objects.py.
    if isinstance(_dfxml_output, bytes):
        sys.stdout.buffer.write(_dfxml_output)
    elif _dfxml_output is not None:
        print(_dfxml_output)