4
4
except ImportError :
5
5
from HTMLParser import HTMLParser
6
6
from htmlentitydefs import name2codepoint
7
+ import re
7
8
8
9
LINEBR = "::LINEBR::"
9
10
@@ -23,6 +24,9 @@ def __init__(self, html, *args, **kwargs):
23
24
except TypeError :
24
25
HTMLParser .__init__ (self , * args , ** kwargs )
25
26
self .skip = False
27
+ self .isProcessingList = False
28
+ self .isProcessingOrderedList = False
29
+ self .orderedNumber = 0
26
30
27
31
# slackified string
28
32
self .output = ''
@@ -43,9 +47,11 @@ def handle_starttag(self, tag, attrs):
43
47
if tag == 'br' or tag == 'p' :
44
48
self .output += LINEBR
45
49
if tag == 'b' or tag == 'strong' :
46
- self .output += '*'
50
+ self .output += ' *'
51
+ if re .match ("h[1-6]{1}" , tag ):
52
+ self .output += ' *'
47
53
if tag == 'i' or tag == 'em' :
48
- self .output += '_'
54
+ self .output += ' _'
49
55
if tag == 'code' :
50
56
self .output += '`'
51
57
if tag == 'a' :
@@ -55,6 +61,16 @@ def handle_starttag(self, tag, attrs):
55
61
self .output += attr [1 ] + '|'
56
62
if tag == 'style' or tag == 'script' :
57
63
self .skip = True
64
+ if tag == 'ul' :
65
+ self .isProcessingList = True
66
+ if tag == 'li' and self .isProcessingList :
67
+ self .output += '• '
68
+ if tag == 'ol' :
69
+ self .orderedNumber = 1
70
+ self .isProcessingOrderedList = True
71
+ if tag == 'li' and self .isProcessingOrderedList :
72
+ self .output += '{}. ' .format (self .orderedNumber )
73
+ self .orderedNumber = self .orderedNumber + 1
58
74
59
75
def handle_endtag (self , tag ):
60
76
"""
@@ -63,15 +79,25 @@ def handle_endtag(self, tag):
63
79
:return:
64
80
"""
65
81
if tag == 'b' or tag == 'strong' :
66
- self .output += '*'
82
+ self .output += '* '
83
+ if re .match ("h[1-6]{1}" , tag ):
84
+ self .output += '* ' + LINEBR
67
85
if tag == 'i' or tag == 'em' :
68
- self .output += '_'
86
+ self .output += '_ '
69
87
if tag == 'a' :
70
88
self .output += '>'
71
89
if tag == 'code' :
72
90
self .output += '`'
73
91
if tag == 'style' or tag == 'script' :
74
92
self .skip = False
93
+ if tag == 'ul' :
94
+ self .isProcessingList = False
95
+ if tag == 'li' and self .isProcessingList :
96
+ self .output += LINEBR
97
+ if tag == 'ol' :
98
+ self .isProcessingOrderedList = False
99
+ if tag == 'li' and self .isProcessingOrderedList :
100
+ self .output += LINEBR
75
101
76
102
def handle_data (self , data ):
77
103
"""
@@ -105,4 +131,12 @@ def get_output(self):
105
131
link: https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
106
132
:return:
107
133
"""
108
- return ' ' .join (self .output .split ()).replace (LINEBR , "\n " )
134
+ output = self .output
135
+ output = re .sub (r'\*(\s\*)+' , '*' , output )
136
+ output = re .sub (r'_( _)+' , '_' , output )
137
+ output = output .replace ('[] ' , '☐ ' ).replace ('[x] ' , '☑︎ ' )
138
+ output = ' ' .join (output .split ())
139
+ output = output .replace (LINEBR , "\n " )
140
+ output = re .sub (r' *\n *' , '\n ' , output )
141
+ output = output .strip ()
142
+ return output
0 commit comments