Init

bebound · Nov 25, 2018 · b531704 · b531704
commit b531704
Show file tree

Hide file tree

Showing 5 changed files with 256 additions and 0 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,117 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+covers
+.idea
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 KK
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/main.py b/main.py
@@ -0,0 +1,113 @@
+import os
+import re
+from multiprocessing import Pool
+
+import requests
+
+from pyquery import PyQuery as pq
+from tqdm import trange, tqdm
+
+
+def extract_books_from_url(url):
+    """Fetch one listing page and collect the books linked on it.
+
+    :param url: str address of the page to fetch
+    :return: [{'title':xxx,'url':xxx}], one entry per ``h3.product-hdg a`` anchor
+    """
+    print('Processing', url)
+    page = pq(requests.get(url).text)
+    books = []
+    for anchor in page('h3.product-hdg a'):
+        # The anchor text is the book title, its href is the book page
+        book_title = pq(anchor).text().strip()
+        book_url = pq(anchor).attr('href')
+        books.append({'title': book_title, 'url': book_url})
+    return books
+
+
+def generate_urls_by_series_page(series, max_page):
+    """Yield the listing-page URL of a series for every page from 1 to max_page.
+
+    :param series: str/int series number, inserted into the URL as-is
+    :param max_page: int number of the last page (inclusive)
+    """
+    template = 'https://bookwalker.jp/series/{}/page{}/'
+    yield from (template.format(series, page) for page in range(1, max_page + 1))
+
+
+def extract_books_from_series(series):
+    """Collect every book of a series, e.g. https://bookwalker.jp/series/4206/
+
+    The series page is fetched once to read the series title and the number of
+    the last page from the pager; each listing page is then scraped in turn.
+
+    :param series: str/int series number
+    :return: (series_title, [{'title':xxx,'url':xxx}])
+    """
+    series_url = 'https://bookwalker.jp/series/{}/'.format(series)
+    d = pq(requests.get(series_url).text)
+    series_title = d('span.overview-hdg-txt').text()
+    print(series_title)
+    # No pager link on the page means the series fits on a single page
+    last_page_link = d('ul.pager-num li:last a')
+    max_page = int(last_page_link.text()) if last_page_link else 1
+    print('Total page number', max_page)
+    books = []
+    for page_url in generate_urls_by_series_page(series, max_page):
+        books += extract_books_from_url(page_url)
+    return series_title, books
+
+
+def decode_cover_number(number):
+    """Reverse the decimal digits of number and subtract one.
+
+    :param number: str/int whose string form consists of digits
+    :return: int
+    """
+    reversed_digits = ''.join(reversed(str(number)))
+    return int(reversed_digits) - 1
+
+
+def download_cover(folder, book):
+    """
+    Download the full-size cover of one book into folder, unless the file
+    is already there.
+
+    :param folder: str folder_name
+    :param book: {'title':xxx,'url':xxx}
+    :return: None
+    :raises ValueError: if the book page has no og:image cover tag
+    :raises requests.HTTPError: if the book page or the cover request fails
+    """
+    r = requests.get(book['url'])
+    r.raise_for_status()
+    match = re.search(r'<meta property="og:image" content="https://c.bookwalker.jp/(\d+)/t_700x780.jpg">',
+                      r.text)
+    if match is None:
+        raise ValueError('No cover image found on {}'.format(book['url']))
+    ori_number = decode_cover_number(match.group(1))
+    url = 'https://c.bookwalker.jp/coverImage_{}.jpg'.format(ori_number)
+    filename, ext = url.split('/')[-1].split('.')
+    # A path separator inside the title would make the file land in another
+    # (usually missing) directory instead of folder
+    title = book['title']
+    for sep in (os.sep, os.altsep):
+        if sep:
+            title = title.replace(sep, '_')
+    new_filename = '{filename} {title}.{ext}'.format(filename=filename, title=title, ext=ext)
+    filepath = os.path.join(folder, new_filename)
+    if not os.path.exists(filepath):
+        r = requests.get(url)
+        # Do not store an error page under a .jpg name
+        r.raise_for_status()
+        with open(filepath, 'wb') as f:
+            f.write(r.content)
+
+
+def update():
+    """Re-run the download for every series that already has a folder.
+
+    Folder names of the form '<series number> <title>' are looked up under
+    ./covers, and the series are processed in ascending numeric order.
+    """
+    covers_dir = './covers'
+    # download_by_series puts its folders inside ./covers, so scan that
+    # directory; nothing has been downloaded yet if it does not exist
+    folders = os.listdir(covers_dir) if os.path.isdir(covers_dir) else []
+    # A set keeps a series from being processed twice when two folders share a number
+    series = sorted({int(folder.split()[0]) for folder in folders if re.match(r'\d+ .+', folder)})
+
+    print(series)
+    for i in series:
+        download_by_series(i)
+
+
+def download_by_series(series):
+    """
+    Download the covers of every book of a series into
+    ./covers/<series> <series title>, using 4 worker processes.
+
+    :param series: str/int
+    :return: None
+    """
+    series_title, books = extract_books_from_series(series)
+    # A path separator in the title would otherwise be read as a nested directory
+    for sep in (os.sep, os.altsep):
+        if sep:
+            series_title = series_title.replace(sep, '_')
+    folder = './covers/{} {}'.format(series, series_title)
+    # makedirs also creates ./covers itself on the very first run
+    os.makedirs(folder, exist_ok=True)
+    p = Pool(4)
+    bar = trange(len(books))
+
+    def report_failure(error):
+        # Without an error_callback a failed download is dropped silently
+        # and the progress bar never reaches its total
+        tqdm.write('Download failed: {!r}'.format(error))
+        bar.update(1)
+
+    try:
+        for i in books:
+            p.apply_async(download_cover, [folder, i], callback=lambda x: bar.update(1),
+                          error_callback=report_failure)
+        p.close()
+        p.join()
+    finally:
+        # Harmless after a normal join; stops the workers if something above raised
+        p.terminate()
+        bar.close()
+
+
+def main():
+    """Ask for input and either update all known series or download the given ones.
+
+    Entering 'update' re-runs the download for every existing series folder;
+    anything else is read as a comma-separated list of series numbers.
+    """
+    user_input = input('Input series/update:').strip()
+    if user_input == 'update':
+        update()
+    else:
+        for series in user_input.split(','):
+            # Spaces around a number or a stray comma would end up inside the request URL
+            series = series.strip()
+            if series:
+                download_by_series(series)
+
+
+# Start the interactive prompt only when the file is run as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+pyquery
+requests
+tqdm
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Auto detect text files and perform LF normalization
		* text=auto