-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 08d227b
Showing
12 changed files
with
733 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{ | ||
"folders": [{ "path": "." }], | ||
"settings": { | ||
"python.linting.pylintEnabled": false, | ||
"python.linting.flake8Enabled": true, | ||
"python.linting.mypyEnabled": true, | ||
"python.linting.enabled": true, | ||
|
||
"editor.formatOnSave": true, | ||
"python.formatting.provider": "black", | ||
|
||
"python.testing.unittestEnabled": false, | ||
"python.testing.nosetestsEnabled": false, | ||
"python.testing.pytestEnabled": true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# from .aviationweek import AviationWeek | ||
import argparse | ||
|
||
from .config import config_dict | ||
from .courrierinternational import CourrierInternational | ||
from .lemonde import LeMonde | ||
from .mondediplomatique import MondeDiplomatique | ||
from .pourlascience import PourLaScience | ||
|
||
|
||
def main():
    """Run the ``news`` command-line interface.

    Builds the table of known newspapers, registers the aliases and
    credentials found in ``config_dict``, then parses the command line.
    The first positional argument selects the newspaper (by name or alias);
    an optional second positional argument is passed to ``get_content``.
    Without a second argument the latest issue is saved, unless ``-l`` is
    given.

    Exits through ``parser.error`` (status 2) when no newspaper is given or
    when the name is not recognised.
    """
    newspapers = {
        "courrierinternational": CourrierInternational,
        "mondediplomatique": MondeDiplomatique,
        "pourlascience": PourLaScience,
        "lemonde": LeMonde,
    }

    for key, value in config_dict.items():
        getter_class = newspapers.get(key)
        if getter_class is None:
            # The configuration may describe newspapers that have no
            # implementation registered above; skip them instead of
            # crashing with a KeyError before the arguments are even parsed.
            continue
        # "alias" is configuration metadata, not a login field: remove it so
        # that it is never sent along with the credentials.
        aliases = value.pop("alias", "")
        for alias in aliases.split(","):
            alias = alias.strip()
            if alias:
                newspapers[alias] = getter_class
        getter_class.credentials = value

    parser = argparse.ArgumentParser(description="news command-line interface")

    parser.add_argument(
        "args",
        nargs=argparse.REMAINDER,
        help="all arguments to dispatch to command",
    )

    parser.add_argument(
        "-l",
        dest="list",
        action="store_true",
        help="print the name of the latest edition without downloading",
    )

    args = parser.parse_args()

    if not args.args:
        parser.error("missing newspaper name")

    name = args.args[0]
    if name not in newspapers:
        known = ", ".join(sorted(newspapers))
        parser.error(f"unknown newspaper '{name}' (choose from: {known})")

    getter = newspapers[name]()
    getter.login()
    if len(args.args) > 1:
        getter.get_content(args.args[1])
    elif not args.list:
        getter.save_latest_issue()
    # NOTE(review): with -l nothing is printed here, although the help text
    # promises the name of the latest edition -- confirm intended behaviour.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import configparser | ||
from pathlib import Path | ||
from typing import Dict | ||
|
||
from appdirs import user_config_dir | ||
|
||
# Location of the per-user configuration file.
config_dir = Path(user_config_dir("kiosque"))
config_file = config_dir / "kiosque.conf"

# Seed a template the first time the file is needed. The test is on the file
# itself (not only on its directory) so that an existing but empty directory
# still gets a template instead of silently yielding an empty configuration.
if not config_file.exists():
    config_template = """
[aviationweek]
alias = awst
[courrierinternational]
alias = courrier
name =
pass =
[mondediplomatique]
alias = diplomatique, diplo, lmd
email =
mot_de_passe =
[pourlascience]
alias = pls
email =
password =
"""
    config_dir.mkdir(parents=True, exist_ok=True)
    config_file.write_text(config_template)

config = configparser.RawConfigParser()
config.read(config_file.as_posix())

# Plain-dict snapshot of the configuration: one entry per section, mapping
# option names to their string values. The parser's "DEFAULT" pseudo-section
# is left out.
config_dict: Dict[str, Dict[str, str]] = {
    section_name: dict(section.items())
    for section_name, section in config.items()
    if section_name != "DEFAULT"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from functools import lru_cache | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from .download import Download | ||
|
||
|
||
class CourrierInternational(Download):
    """Downloader for the magazine published on courrierinternational.com."""

    base_url = "https://www.courrierinternational.com"
    login_url = base_url + "/login?destination=node/6"

    def _fetch_soup(self, url):
        """GET *url* with the session and return the parsed page.

        Raises whatever ``raise_for_status`` raises on an HTTP error.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.content, features="lxml")

    def login_dict(self):
        """Return the form fields to POST to ``login_url``.

        The login page is fetched first to read the value of its
        ``form_build_id`` input; the configured credentials are merged in
        last.
        """
        page = self._fetch_soup(self.login_url)
        build_id_input = page.find("input", attrs={"name": "form_build_id"})

        payload = {
            "remember_me": "1",
            "form_build_id": build_id_input.attrs["value"],
            "form_id": "user_login",
            "op": "Se connecter",
        }
        payload.update(self.credentials)
        return payload

    @lru_cache()
    def latest_issue_url(self):
        """Return the ``href`` of the download link of the latest issue.

        Follows the first ``item hebdo`` article listed on ``/magazine`` and
        looks for the ``issue-download`` anchor on that page.
        """
        magazine_page = self._fetch_soup(self.base_url + "/magazine")
        latest_entry = magazine_page.find(
            "article", attrs={"class": "item hebdo"}
        )
        issue_path = latest_entry.find("a").attrs["href"]

        issue_page = self._fetch_soup(self.base_url + issue_path)
        download_link = issue_page.find("a", attrs={"class": "issue-download"})
        return download_link.attrs["href"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from pathlib import Path | ||
from typing import Dict | ||
|
||
import requests | ||
|
||
|
||
class Download:
    """Base class for downloaders that log in to a site and fetch an issue.

    Subclasses provide ``login_url``, override ``login_dict`` with the form
    fields of the login request, and implement ``latest_issue_url``.
    """

    # URL the login form is POSTed to; defined by each subclass.
    login_url: str
    # Login form fields; not set in this class, populated from outside.
    credentials: Dict[str, str]

    def __init__(self) -> None:
        """Create a fresh HTTP session with a browser-like User-Agent."""
        self.connected = False
        # Response of the latest issue download, kept once fetched.
        self.latest_issue = None
        self.session: requests.Session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) "
                    "Gecko/20100101 Firefox/72.0"
                )
            }
        )

    def login_dict(self) -> Dict[str, str]:
        """Return the form fields sent by ``login``; empty by default."""
        return {}

    def login(self) -> None:
        """POST ``login_dict()`` to ``login_url`` and mark the session connected.

        Raises the error of ``raise_for_status`` when the request fails.
        """
        c = self.session.post(self.login_url, data=self.login_dict())
        c.raise_for_status()
        self.connected = True

    def latest_issue_url(self) -> str:
        """Return the URL of the latest issue; must be overridden."""
        raise NotImplementedError()

    def file_name(self, c) -> str:
        """Return the file name under which the latest issue is saved.

        This default implementation ignores the response ``c`` and uses the
        last path component of ``latest_issue_url()``, logging in first if
        needed.
        """
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        return Path(url).name

    def get_latest_issue(self):
        """Download the latest issue and return the HTTP response.

        The response is stored in ``self.latest_issue`` so that later calls
        return it without downloading again. Logs in first if needed.

        Raises the error of ``raise_for_status`` when the download fails, so
        that an HTTP error page is never mistaken for the issue itself.
        """
        if self.latest_issue is not None:
            return self.latest_issue
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        c = self.session.get(url)
        c.raise_for_status()
        self.latest_issue = c
        return c

    def save_latest_issue(self):
        """Write the latest issue to the current directory and report it."""
        c = self.get_latest_issue()
        full_path = Path(".") / self.file_name(c)
        full_path.write_bytes(c.content)
        print(f"File written: {full_path}")

    def get_content(self, url):
        """Fetch a single piece of content at *url*; no-op in this class."""
        return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import re | ||
from pathlib import Path | ||
|
||
import pypandoc | ||
from bs4 import BeautifulSoup | ||
|
||
from .download import Download | ||
|
||
|
||
class LeMonde(Download):
    """Downloader that saves a lemonde.fr article as a Markdown file."""

    base_url = "https://www.lemonde.fr/"
    login_url = "https://secure.lemonde.fr/sfuser/connexion"

    # (tag name, class) pairs removed from the article body before conversion.
    _unwanted_blocks = (
        ("section", "catcher"),
        ("section", "author"),
        ("section", "article__reactions"),
        ("div", "dfp__inread"),
    )

    @staticmethod
    def _stripped_text(tag):
        """Return the stripped text of *tag*, or "" when the tag is missing."""
        return "" if tag is None else tag.text.strip()

    def login_dict(self):
        """Return the form fields to POST to ``login_url``.

        The login page is fetched first to read the value of its
        ``connection[_token]`` input. Reads the ``mail`` and ``password``
        entries of ``self.credentials``.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="connection[_token]")
        token = e.find("input", attrs=attrs).attrs["value"]

        return {
            "connection[mail]": self.credentials["mail"],
            "connection[password]": self.credentials["password"],
            "connection[stay_connected]": 1,
            "connection[save]": "",
            "connection[newsletters]": [],
            "connection[_token]": token,
        }

    def get_content(self, url):
        """Download the article at *url* and write it as Markdown.

        The file is created in the current directory and named
        ``<yyyy-mm-dd>-<last URL component>`` with ``.html`` replaced by
        ``.md``. It starts with a ``---`` delimited header (title, author,
        date, description) followed by the article body converted by pandoc.
        Header fields that cannot be found on the page are left empty.

        Raises:
            ValueError: if *url* contains no ``yyyy/mm/dd`` date, or if no
                article body is found on the page.
            The error of ``raise_for_status`` when the download fails.
        """
        # Explicit check rather than ``assert``: assertions are stripped
        # when Python runs with -O.
        m = re.search(r"\d{4}/\d{2}/\d{2}", url)
        if m is None:
            raise ValueError(f"No yyyy/mm/dd date found in URL: {url}")
        date = m.group().replace("/", "-")

        filename = f"{date}-{url.split('/')[-1].replace('.html', '.md')}"

        c = self.session.get(url)
        c.raise_for_status()
        e = BeautifulSoup(c.content, features="lxml")

        title = e.find("h1")
        author = e.find("a", attrs={"class": "article__author"})
        if author is None:
            author = e.find("a", attrs={"class": "article__author-link"})
        desc = e.find("p", attrs={"class": "article__desc"})

        header = f"""---
title: {self._stripped_text(title)}
author: {self._stripped_text(author)}
date: {date}
header: {self._stripped_text(desc)}
---
"""

        # The body is either an <article> or a <section> of class
        # article__content; when the <article> wraps such a <section>, the
        # inner one is used.
        article = e.find("article", attrs={"class": "article__content"})
        if article is None:
            article = e.find("section", attrs={"class": "article__content"})
        else:
            embedded = article.find(
                "section", attrs={"class": "article__content"}
            )
            if embedded is not None:
                article = embedded
        if article is None:
            raise ValueError(f"No article content found at: {url}")

        for tag_name, css_class in self._unwanted_blocks:
            for x in article.find_all(tag_name, attrs={"class": css_class}):
                x.decompose()

        for x in article.find_all("h2"):
            x.attrs.clear()
        for x in article.find_all("h3"):
            x.name = "blockquote"
            x.attrs.clear()
        for x in article.find_all("figure"):
            x.decompose()

        # pandoc is handed the HTML markup as a string, not the Tag object.
        output = pypandoc.convert_text(str(article), "md", format="html")

        print(filename)
        # Write as UTF-8 regardless of the locale: the text contains accented
        # characters.
        Path(filename).write_text(f"{header}\n\n{output}", encoding="utf-8")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from functools import lru_cache | ||
from urllib.parse import unquote | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from .download import Download | ||
|
||
|
||
class MondeDiplomatique(Download):
    """Downloader for the PDF edition linked from monde-diplomatique.fr."""

    base_url = "https://www.monde-diplomatique.fr/"
    login_url = "https://lecteurs.mondediplo.net/?page=connexion_sso"

    def _fetch_soup(self, url):
        """GET *url* with the session and return the parsed page.

        Raises whatever ``raise_for_status`` raises on an HTTP error.
        """
        response = self.session.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.content, features="lxml")

    def login_dict(self):
        """Return the form fields to POST to ``login_url``.

        The login page is fetched first to read the value of its
        ``formulaire_action_args`` input; the configured credentials are
        merged in last.
        """
        page = self._fetch_soup(self.login_url)
        args_input = page.find(
            "input", attrs={"name": "formulaire_action_args"}
        )

        payload = {
            "formulaire_action": "identification_sso",
            "formulaire_action_args": args_input.attrs["value"],
            "retour": "https://www.monde-diplomatique.fr/",
            "site_distant": "https://www.monde-diplomatique.fr/",
            "valider": "Valider",
        }
        payload.update(self.credentials)
        return payload

    @lru_cache()
    def latest_issue_url(self):
        """Return the URL of the PDF of the current issue.

        Follows the ``entree-numero`` link of the home page, then the first
        link inside the ``format PDF`` block of that page.
        """
        home_page = self._fetch_soup(self.base_url)
        issue_link = home_page.find("a", attrs={"id": "entree-numero"})
        issue_path = issue_link.attrs["href"]

        issue_page = self._fetch_soup(self.base_url + issue_path)
        pdf_block = issue_page.find("div", attrs={"class": "format PDF"})
        pdf_path = pdf_block.find("a").attrs["href"]
        return self.base_url + pdf_path

    def file_name(self, c) -> str:
        """Return the file name announced by the response ``c``.

        Takes the second ``;``-separated part of the ``Content-Disposition``
        header, keeps what follows its first ``=``, removes surrounding
        double quotes and URL-decodes the result.
        """
        disposition = c.headers["Content-Disposition"]
        second_part = disposition.split(";")[1]
        raw_name = second_part.split("=")[1]
        return unquote(raw_name.strip('"'))
Oops, something went wrong.