
Commit a4eb3b6: Xlsx Controller
1 parent 7ca5b66

9 files changed, +183 -18 lines

.gitignore (+2)

@@ -9,3 +9,5 @@ sys.cfg
 
 test/
 
+*.xlsx
+nohup.out

crawler_main.py (+2 -1)

@@ -17,4 +17,5 @@
 if len(argv) == 1: crawler_name = "sample"
 elif len(argv) == 2: crawler_name = argv[1]
 
-crawler = Do(crawler_name)
+crawler = Do(crawler_name)
+crawler.rds_to_xlsx("{}.xlsx".format(crawler_name), crawler_name)

crawler_test.py (+21)

@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+from util.xlsx.reader import XlsxReader
+from util.xlsx.writer import write_xlsx
+import re, json
+
+if __name__ == "__main__":
+    keys = [
+        'lat', 'busi_area', 'payment', 'floor',
+        'house_id', 'price', 'house_type', 'area',
+        'lng', '月付', '季付', '半年付', '年付'
+    ]
+    reader = XlsxReader("ziroom.xlsx")
+    v = reader.get_sheet_data("ziroom")
+
+    with write_xlsx("ziroom_clean.xlsx", "ziroom") as f:
+        for s in v:
+            for idx, sv in enumerate(s):
+                if idx == 1 and sv is not None:
+                    [s.append(x) for x in json.loads(sv.replace('\'','\"'))]
+                    s.pop(1)
+            f.write_dict(dict(zip(keys, s)))
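
The cleanup loop above unpacks the JSON-encoded payment column (index 1) into extra cells before the row is zipped against keys. A minimal sketch of that transformation on a single made-up row, assuming the second column holds a single-quoted JSON list as in the script (the other values are placeholders):

import json

row = [39.98, "['500', '480', '460', '450']", 5]   # hypothetical row: lat, packed payments, floor

for idx, cell in enumerate(row):
    if idx == 1 and cell is not None:
        # Swap single quotes for double quotes so json can parse the cell,
        # append each payment as its own column, then drop the packed cell.
        row.extend(json.loads(cell.replace("'", '"')))
        row.pop(1)

print(row)  # [39.98, 5, '500', '480', '460', '450']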

do/do.py (+6 -2)

@@ -4,6 +4,7 @@
 from module.request.http import HTTPListRequest, HTTPDetailRequest
 from module.parser.detail import ParserDetail
 from module.parser.list import ParserList
+from module.output.rds_to_xlsx import RdsToXlsx
 
 from util.redis import RedisController
 
@@ -33,7 +34,7 @@ def __init__(self, crawler_name):
         # init
         self.__load__
 
-        self.do() # debug code
+        # self.do() # debug code
 
     def do(self):
         '''do
@@ -93,4 +94,7 @@ def __parser_detail__(self):
         Parse the data from req of detail websites/APIs
         '''
         parser = ParserDetail(self.detail_res_iter, self.crawler_conf, self.rds)
-        parser.save
+        parser.save
+
+    def rds_to_xlsx(self, file_name, sheet_name):
+        RdsToXlsx.save(self.rds, file_name, sheet_name)
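
Since self.do() is now commented out of __init__, constructing Do no longer starts a crawl on its own, and the new rds_to_xlsx method only exports whatever already sits in Redis. A minimal sketch of the intended flow, assuming do() still has to be called explicitly before exporting (the commit itself does not show that call):

from do.do import Do

crawler = Do("ziroom")        # loads config only; the auto-run in __init__ is disabled
crawler.do()                  # assumption: fills Redis with parsed records
crawler.rds_to_xlsx("ziroom.xlsx", "ziroom")   # dump those records into a worksheet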

module/output/rds_to_xlsx.py (+13)

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+
+from util.redis import RedisController
+from util.xlsx.writer import write_xlsx
+import json
+
+class RdsToXlsx():
+
+    @staticmethod
+    def save(rds, file_name, sheet_name):
+        with write_xlsx(file_name, sheet_name) as x:
+            for data in rds.rscan:
+                x.write_dict(json.loads(data[1]))
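
RdsToXlsx.save only touches rds through rds.rscan and expects each yielded item to carry a JSON document at index 1, so it can be exercised without a live Redis. A minimal sketch with a stand-in object; the (key, json_string) shape of rscan is inferred from the data[1] access, not confirmed elsewhere in the commit:

import json
from module.output.rds_to_xlsx import RdsToXlsx

class FakeRds:
    # Stand-in for util.redis.RedisController: rscan is assumed to be an
    # iterable of (key, json_string) pairs, matching the data[1] access above.
    @property
    def rscan(self):
        yield ("house:1", json.dumps({"house_id": 1, "price": 3200}))
        yield ("house:2", json.dumps({"house_id": 2, "price": 2890}))

RdsToXlsx.save(FakeRds(), "houses.xlsx", "houses")   # writes both records into sheet "houses"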

module/request/http.py (+6 -12)

@@ -57,7 +57,7 @@ def list_res_iter(self):
         finally:
             yield res
 
-        if cursor > int(total):
+        if cursor > int(total):
             break
 
         # break # debug code
@@ -96,28 +96,22 @@ def __mutil_req__(method, mutil, crawler, **kwargs):
     req with different config.
     '''
     if 'cpath' in kwargs:
-        cpath = kwargs['cpath']
+        cpath = kwargs['cpath']
 
     if 'params' in crawler.keys():
-        params = int(crawler['params'])
+        params = int(crawler['params'])
 
     if 'list_url' in crawler.keys():
-        url_tpl = crawler['list_url']
-
-    # if 'childpath' in crawler.keys():
-    #     childpath = crawler['childpath'].split('.')
+        url_tpl = crawler['list_url']
 
     if 'pageshow' in crawler.keys():
         pageshow = int(crawler['pageshow'])
 
     if 'data' in crawler.keys():
-        data = json.loads(crawler['data'])
+        data = json.loads(crawler['data'])
 
     if 'data_key' in crawler.keys():
-        data_key = json.loads(crawler['data_key'])
-
-    # if 'total' in crawler.keys():
-    #     totaler = crawler['total']
+        data_key = json.loads(crawler['data_key'])
 
     if method == 1:
         yield from HTTPListRequest.__req_get_api__(mutil, url_tpl, params)

util/common/tools.py (+1 -3)

@@ -19,6 +19,4 @@ def finder(result, find):
         return result
     except Exception as e:
         print("Err:{}********\n{}\n{}\n".format(e, result, find))
-        raise e
-
-# def lxmlfinder()
+        raise e

util/xlsx/reader.py (+72)

@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+
+from openpyxl import load_workbook
+
+class XlsxReader():
+
+    def __init__(self, filename):
+        self._wb = load_workbook(filename)
+
+    @property
+    def sheetnames(self):
+        return self._wb.sheetnames
+
+    def get_sheet_content(self, sheetname):
+        return self._wb[sheetname]
+
+    def get_titles(self, sheetname):
+        return [title.value for title in self.get_sheet_content(sheetname)["1"]]
+
+    @property
+    def titles(self):
+        '''Yield titles sheet by sheet.'''
+        for sheetname in self.sheetnames:
+            yield {sheetname:self.get_titles(sheetname)}
+
+    @property
+    def titles_dict(self):
+        '''Full title dictionary for the whole workbook.'''
+        title_dict = dict()
+        for y in self.titles:
+            title_dict = dict(title_dict, **y)
+        return title_dict
+
+    def max_row(self, sheetname):
+        return self.get_sheet_content(sheetname).max_row
+
+    def max_column(self, sheetname):
+        return self.get_sheet_content(sheetname).max_column
+
+    def get_sheet_data(self, sheetname):
+        '''Get all the contents of one sheet.'''
+        for idx in range(2, self.max_row(sheetname)+1):
+            yield [content.value for content in self.get_sheet_content(sheetname)[idx]]
+
+    def get_sheet_contents(self, sheetname):
+        '''Get all the contents of one sheet.'''
+        sheet_data = self.get_sheet_data(sheetname)
+        for data in sheet_data:
+            if data[0] is not None:
+                yield {str(data[0]):data[1:]}
+
+    def get_sheet_content_dict(self, sheetname):
+        '''Get the full dictionary of one sheet's contents.'''
+        return [y for y in self.get_sheet_contents(sheetname)]
+
+    @property
+    def contents(self):
+        '''Data for the whole file.'''
+        for sheetname in self.sheetnames:
+            yield {sheetname:self.get_sheet_content_dict(sheetname)}
+
+    @property
+    def contents_dict(self):
+        return [y for y in self.contents]
+
+if __name__ == "__main__":
+    x = XlsxReader("./test.xlsx")
+    print(x.sheetnames)
+    print(x.get_titles("1"))
+    print(x.titles_dict)
+    print(x.get_sheet_content_dict("1"))
+    print(x.contents_dict)
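
The reader exposes the same data at several granularities: titles_dict maps each sheet name to its first-row headers, and get_sheet_content_dict keys every data row by its first cell. A minimal sketch that builds a throwaway workbook with openpyxl and reads it back to show those shapes (file name and values are made up):

from openpyxl import Workbook
from util.xlsx.reader import XlsxReader

wb = Workbook()
ws = wb.active
ws.title = "houses"
ws.append(["house_id", "price", "area"])   # row 1: titles
ws.append([1230601, 3200, 15.5])           # row 2: one data row
wb.save("demo.xlsx")

r = XlsxReader("demo.xlsx")
print(r.titles_dict)                       # {'houses': ['house_id', 'price', 'area']}
print(r.get_sheet_content_dict("houses"))  # [{'1230601': [3200, 15.5]}]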

util/xlsx/writer.py (+60)

@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+from openpyxl import Workbook
+from .reader import XlsxReader
+import os
+
+class XlsxWriter(XlsxReader):
+
+    def __init__(self, filename, sheetname):
+        self.filename = filename
+        self.sheetname = sheetname
+        XlsxReader.__init__(self, filename)
+        self._sheet = self._wb.active
+        self._title = {t:chr(i+65) for i, t in enumerate(self.titles_dict[sheetname])}
+        self._title_cur = len(self._title.keys())
+        self._cur = len(self.get_sheet_content_dict(self.sheetname)) + 2
+
+    def __enter__(self):
+        print("*****")
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self._wb.save(self.filename)
+
+    def write_dict(self, data):
+        self.write_title(data)
+        self.write_data(data)
+
+    def write_title(self, data):
+        for k in data.keys():
+            if k not in self._title.keys():
+                self._title[k] = chr(65+self._title_cur)
+                self._title_cur += 1
+        for k, v in zip(self._title.keys(), self._title.values()):
+            self._sheet["%s%d"%(v, 1)] = k
+
+    def write_data(self, data):
+        for k, v in zip(data.keys(), data.values()):
+            self._sheet["%s%d"%(self._title[k], self._cur)] = str(v)
+        self._cur += 1
+
+
+def write_xlsx(filename, sheetname):
+    if not os.path.exists(filename):
+        wb = Workbook()
+        st = wb.active
+        st.title = sheetname
+        wb.save(filename)
+
+    return XlsxWriter(filename, sheetname)
+
+if __name__ == "__main__":
+    with write_xlsx("./test.xlsx", "test") as x:
+        x.write_dict(
+            {
+                "test1":'Test1',
+                "test2":'Test2',
+                "house_id":1230601
+            }
+        )
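
One caveat worth noting in XlsxWriter: column letters come from chr(i + 65), so the title-to-column mapping only holds for the first 26 titles; title 27 would map to '[', which is not a valid column. openpyxl ships get_column_letter, which rolls over to AA, AB, and so on. A small sketch of the same mapping built on it, offered as an alternative rather than what this commit does:

from openpyxl.utils import get_column_letter

titles = ["house_id", "price", "house_type", "area"]

# get_column_letter is 1-based and keeps going past Z (27 -> "AA"),
# unlike chr(i + 65), which breaks after 26 columns.
title_columns = {t: get_column_letter(i + 1) for i, t in enumerate(titles)}

print(title_columns)          # {'house_id': 'A', 'price': 'B', 'house_type': 'C', 'area': 'D'}
print(get_column_letter(27))  # AA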
