# regexp_sifter.py
import sys, re
# Ordered (pattern, replacement) pairs which turn a normalized Atom entry into
# a stream of text, after removal of non-human metadata.  Order matters:
#   1. elements that only carry metadata are dropped together with their text;
#   2. human-readable attribute values (alt/title/label/term) are kept in
#      place of the tag that carried them;
#   3. every remaining tag is stripped;
#   4. the predefined XML entities are unescaped, '&amp;' last, so doubly
#      escaped text such as '&amp;lt;' becomes '&lt;' rather than '<';
#   5. runs of whitespace are collapsed to a single space.
TEXT_PATTERNS = [
    (re.compile(r'<id>.*?</id>'), ' '),
    (re.compile(r'<url>.*?</url>'), ' '),
    (re.compile(r'<source>.*?</source>'), ' '),
    (re.compile(r'<updated.*?</updated>'), ' '),
    (re.compile(r'<published.*?</published>'), ' '),
    (re.compile(r'<link [^>]*>'), ' '),
    (re.compile(r'''<[^>]* alt=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* title=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* label=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* term=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'<[^>]+>'), ' '),
    (re.compile(r'&gt;'), '>'),
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&apos;'), "'"),
    (re.compile(r'&quot;'), '"'),
    (re.compile(r'&amp;'), '&'),
    (re.compile(r'\s+'), ' '),
]


def parse_options(args):
    """Pair up alternating ``--name value`` arguments into a dict.

    ``args`` is the argument list without the program name.  A trailing
    name that has no value is silently dropped, because ``zip`` stops at
    the shorter of the two slices.
    """
    return dict(zip(args[0::2], args[1::2]))


def extract_text(entry):
    """Return ``entry`` reduced to text by applying TEXT_PATTERNS in order."""
    text = entry
    for pattern, replacement in TEXT_PATTERNS:
        text = pattern.sub(replacement, text)
    return text


def passes_filters(text, options):
    """Return True if ``text`` satisfies the require/exclude options.

    ``options['--require']`` and ``options['--exclude']`` each hold
    newline-separated regular expressions; blank lines are ignored.
    Every required expression must be found in ``text`` and no excluded
    expression may be found.  A missing option imposes no constraint.
    """
    # process requirements
    for regexp in options.get('--require', '').split('\n'):
        if regexp and not re.search(regexp, text):
            return False

    # process exclusions
    for regexp in options.get('--exclude', '').split('\n'):
        if regexp and re.search(regexp, text):
            return False

    return True


def main(args=None, stdin=None, stdout=None):
    """Filter one entry read from ``stdin``; return the process exit status.

    The entry is written unchanged to ``stdout`` (followed by a newline)
    and 0 is returned when it passes the filters; otherwise nothing is
    written and 1 is returned.  ``args`` defaults to ``sys.argv[1:]``,
    ``stdin``/``stdout`` default to the process streams.
    """
    if args is None:
        args = sys.argv[1:]
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout

    # parse options
    options = parse_options(args)

    # read entry
    doc = stdin.read()

    if not passes_filters(extract_text(doc), options):
        return 1

    # if we get this far, the feed is to be included
    stdout.write(doc + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())