Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xoolive committed Aug 1, 2021
0 parents commit 08d227b
Show file tree
Hide file tree
Showing 12 changed files with 733 additions and 0 deletions.
16 changes: 16 additions & 0 deletions kiosque.code-workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"folders": [{ "path": "." }],
"settings": {
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.linting.mypyEnabled": true,
"python.linting.enabled": true,

"editor.formatOnSave": true,
"python.formatting.provider": "black",

"python.testing.unittestEnabled": false,
"python.testing.nosetestsEnabled": false,
"python.testing.pytestEnabled": true
}
}
49 changes: 49 additions & 0 deletions kiosque/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# from .aviationweek import AviationWeek
import argparse

from .config import config_dict
from .courrierinternational import CourrierInternational
from .lemonde import LeMonde
from .mondediplomatique import MondeDiplomatique
from .pourlascience import PourLaScience


def main():
    """Command-line entry point.

    Reads credentials and aliases from the user configuration, then either
    downloads a single article (second positional argument), prints the URL
    of the latest issue (``-l``), or downloads and saves the latest issue.
    """

    newspapers = {
        "courrierinternational": CourrierInternational,
        "mondediplomatique": MondeDiplomatique,
        "pourlascience": PourLaScience,
        "lemonde": LeMonde,
    }

    # Register configured aliases and attach credentials to each class.
    for key, value in config_dict.items():
        # The config template may contain sections (e.g. [aviationweek])
        # with no registered class yet; skip them instead of crashing.
        if key not in newspapers:
            continue
        if "alias" in value:
            for alias in value["alias"].split(","):
                newspapers[alias.strip()] = newspapers[key]
            # Aliases are routing information, not credentials.
            del value["alias"]
        newspapers[key].credentials = value

    parser = argparse.ArgumentParser(description="news command-line interface")

    parser.add_argument(
        "args",
        nargs=argparse.REMAINDER,
        help="all arguments to dispatch to command",
    )

    parser.add_argument(
        "-l",
        dest="list",
        action="store_true",
        help="print the name of the latest edition without downloading",
    )

    args = parser.parse_args()

    if not args.args:
        parser.error("a newspaper name is required")
    name = args.args[0]
    if name not in newspapers:
        parser.error(f"unknown newspaper: {name}")

    getter = newspapers[name]()
    getter.login()
    if len(args.args) > 1:
        # An explicit URL was passed: fetch that single article.
        getter.get_content(args.args[1])
    elif args.list:
        # -l: show what the latest edition is, without downloading it.
        print(getter.latest_issue_url())
    else:
        getter.save_latest_issue()
40 changes: 40 additions & 0 deletions kiosque/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import configparser
from pathlib import Path
from typing import Dict

from appdirs import user_config_dir

config_dir = Path(user_config_dir("kiosque"))
config_file = config_dir / "kiosque.conf"

# First run: create the configuration directory and write a template file
# for the user to fill in with aliases and credentials.
if not config_dir.exists():
    config_template = """
[aviationweek]
alias = awst
[courrierinternational]
alias = courrier
name =
pass =
[mondediplomatique]
alias = diplomatique, diplo, lmd
email =
mot_de_passe =
[pourlascience]
alias = pls
email =
password =
"""
    config_dir.mkdir(parents=True)
    config_file.write_text(config_template)

config = configparser.RawConfigParser()
config.read(config_file.as_posix())

# Plain-dict view of the configuration: one entry per newspaper section,
# mapping option names to their raw string values.
config_dict: Dict[str, Dict[str, str]] = {
    section: dict(options.items())
    for section, options in config.items()
    if section != "DEFAULT"
}
42 changes: 42 additions & 0 deletions kiosque/courrierinternational.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from functools import lru_cache

from bs4 import BeautifulSoup

from .download import Download


class CourrierInternational(Download):
    """Download client for Courrier International.

    Logs in through the Drupal login form and locates the PDF of the
    latest weekly issue from the /magazine page.
    """

    base_url = "https://www.courrierinternational.com"
    login_url = base_url + "/login?destination=node/6"

    def login_dict(self):
        """Build the POST payload for the login form.

        Fetches the login page first to extract the server-generated
        ``form_build_id`` token required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="form_build_id")
        form_id = e.find("input", attrs=attrs).attrs["value"]

        return {
            "remember_me": "1",
            "form_build_id": form_id,
            "form_id": "user_login",
            "op": "Se connecter",
            **self.credentials,
        }

    def latest_issue_url(self):
        """Return the download URL of the latest issue (cached per instance).

        NOTE: previously decorated with ``lru_cache()``, which on an instance
        method keys on ``self`` and keeps the instance alive for the cache's
        lifetime; a per-instance attribute avoids that leak.
        """
        cached = getattr(self, "_latest_issue_url", None)
        if cached is not None:
            return cached

        c = self.session.get(self.base_url + "/magazine")
        c.raise_for_status()
        e = BeautifulSoup(c.content, features="lxml")

        # The first "item hebdo" article on the magazine page is the
        # latest weekly issue.
        x = e.find("article", attrs={"class": "item hebdo"})

        c = self.session.get(self.base_url + x.find("a").attrs["href"])
        c.raise_for_status()
        e = BeautifulSoup(c.content, features="lxml")
        attrs = {"class": "issue-download"}

        url = e.find("a", attrs=attrs).attrs["href"]
        self._latest_issue_url = url
        return url
58 changes: 58 additions & 0 deletions kiosque/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from pathlib import Path
from typing import Dict

import requests


class Download:
    """Base class for newspaper download sessions.

    Subclasses must set ``login_url`` and implement ``latest_issue_url``;
    they may override ``login_dict`` (form payload), ``file_name`` and
    ``get_content``. ``credentials`` is attached by the CLI from the
    user configuration.
    """

    login_url: str
    credentials: Dict[str, str]

    def __init__(self) -> None:
        self.connected = False
        # Cached Response of the latest issue, filled by get_latest_issue().
        self.latest_issue = None
        self.session: requests.Session = requests.Session()
        # Some sites reject requests with the default requests User-Agent.
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) "
                    "Gecko/20100101 Firefox/72.0"
                )
            }
        )

    def login_dict(self) -> Dict[str, str]:
        """Return the form payload to POST to ``login_url`` (default: empty)."""
        return {}

    def login(self) -> None:
        """Authenticate the session by POSTing the login form."""
        c = self.session.post(self.login_url, data=self.login_dict())
        c.raise_for_status()
        self.connected = True

    def latest_issue_url(self) -> str:
        """Return the URL of the latest issue; implemented by subclasses."""
        raise NotImplementedError()

    def file_name(self, c) -> str:
        """Return the output file name for a downloaded issue.

        The default derives it from the last path component of the issue
        URL; ``c`` (the Response) is unused here but available to
        subclasses that read e.g. Content-Disposition.
        """
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        return Path(url).name

    def get_latest_issue(self):
        """Download the latest issue once and return the Response.

        The result is cached on the instance; the original version checked
        ``self.latest_issue`` but never assigned it, so every call
        re-downloaded the issue.
        """
        if self.latest_issue is not None:
            return self.latest_issue
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        c = self.session.get(url)
        # Fail loudly on HTTP errors, as every other request in this
        # package does.
        c.raise_for_status()
        self.latest_issue = c
        return c

    def save_latest_issue(self):
        """Download the latest issue and write it to the current directory."""
        c = self.get_latest_issue()
        full_path = Path(".") / self.file_name(c)
        full_path.write_bytes(c.content)
        print(f"File written: {full_path}")

    def get_content(self, url):
        """Fetch a single article at ``url``; no-op unless overridden."""
        return
91 changes: 91 additions & 0 deletions kiosque/lemonde.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import re
from pathlib import Path

import pypandoc
from bs4 import BeautifulSoup

from .download import Download


class LeMonde(Download):
    """Download client for Le Monde: logs in and saves articles as Markdown."""

    base_url = "https://www.lemonde.fr/"
    login_url = "https://secure.lemonde.fr/sfuser/connexion"

    def login_dict(self):
        """Build the POST payload for the login form.

        Fetches the login page first to extract the CSRF token
        (``connection[_token]``) required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="connection[_token]")
        token = e.find("input", attrs=attrs).attrs["value"]

        return {
            "connection[mail]": self.credentials["mail"],
            "connection[password]": self.credentials["password"],
            "connection[stay_connected]": 1,
            "connection[save]": "",
            "connection[newsletters]": [],
            "connection[_token]": token,
        }

    def get_content(self, url):
        """Download the article at ``url`` and save it as a Markdown file.

        The output file is named ``YYYY-MM-DD-<slug>.md`` (date taken from
        the article URL) and starts with a YAML front-matter header.
        """
        c = self.session.get(url)
        # Consistent with every other request in the package.
        c.raise_for_status()

        m = re.search(r"\d{4}/\d{2}/\d{2}", url)
        if m is None:
            # Explicit error instead of assert (asserts vanish under -O).
            raise ValueError(f"No date found in URL: {url}")
        date = m.group().replace("/", "-")

        filename = f"{date}-{url.split('/')[-1].replace('.html', '.md')}"

        e = BeautifulSoup(c.content, features="lxml")

        title = e.find("h1")
        author = e.find("a", attrs={"class": "article__author"})
        if author is None:
            author = e.find("a", attrs={"class": "article__author-link"})
        desc = e.find("p", attrs={"class": "article__desc"})

        def text_of(tag) -> str:
            # Some articles omit the author or description; fall back to ""
            # instead of crashing on tag.text with tag is None.
            return tag.text.strip() if tag is not None else ""

        header = f"""---
title: {text_of(title)}
author: {text_of(author)}
date: {date}
header: {text_of(desc)}
---
"""

        # Regular articles use <article class="article__content">; "live"
        # pages embed the content in a <section> of the same class.
        article = e.find("article", attrs={"class": "article__content"})
        if article is None:
            article = e.find("section", attrs={"class": "article__content"})
        else:
            embedded = article.find(
                "section", attrs={"class": "article__content"}
            )
            if embedded is not None:
                article = embedded

        # Strip boilerplate: teasers, author boxes, reactions and ads.
        for x in article.find_all("section", attrs={"class": "catcher"}):
            x.decompose()
        for x in article.find_all("section", attrs={"class": "author"}):
            x.decompose()
        for x in article.find_all(
            "section", attrs={"class": "article__reactions"}
        ):
            x.decompose()
        for x in article.find_all("div", attrs={"class": "dfp__inread"}):
            x.decompose()

        # Normalize markup before the pandoc conversion: plain h2 titles,
        # h3 pull-quotes become blockquotes, figures (images) are dropped.
        for x in article.find_all("h2"):
            x.attrs.clear()
        for x in article.find_all("h3"):
            x.name = "blockquote"
            x.attrs.clear()
        for x in article.find_all("figure"):
            x.decompose()

        output = pypandoc.convert_text(article, "md", format="html")

        print(filename)
        Path(filename).write_text(f"{header}\n\n{output}")
56 changes: 56 additions & 0 deletions kiosque/mondediplomatique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from functools import lru_cache
from urllib.parse import unquote

from bs4 import BeautifulSoup

from .download import Download


class MondeDiplomatique(Download):
    """Download client for Le Monde diplomatique.

    Logs in through the SSO form on lecteurs.mondediplo.net and locates
    the PDF of the current issue from the front page.
    """

    base_url = "https://www.monde-diplomatique.fr/"
    login_url = "https://lecteurs.mondediplo.net/?page=connexion_sso"

    def login_dict(self):
        """Build the POST payload for the SPIP SSO login form.

        Fetches the login page first to extract the server-generated
        ``formulaire_action_args`` token required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="formulaire_action_args")
        form_id = e.find("input", attrs=attrs).attrs["value"]

        return {
            "formulaire_action": "identification_sso",
            "formulaire_action_args": form_id,
            "retour": "https://www.monde-diplomatique.fr/",
            "site_distant": "https://www.monde-diplomatique.fr/",
            "valider": "Valider",
            **self.credentials,
        }

    def latest_issue_url(self):
        """Return the PDF download URL of the current issue (cached per instance).

        NOTE: previously decorated with ``lru_cache()``, which on an instance
        method keys on ``self`` and keeps the instance alive for the cache's
        lifetime; a per-instance attribute avoids that leak.
        """
        cached = getattr(self, "_latest_issue_url", None)
        if cached is not None:
            return cached

        c = self.session.get(self.base_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")

        # Front page links to the current issue via #entree-numero.
        current = e.find("a", attrs={"id": "entree-numero"}).attrs["href"]

        c = self.session.get(self.base_url + current)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")

        attrs = {"class": "format PDF"}
        url = e.find("div", attrs=attrs).find("a").attrs["href"]
        url = self.base_url + url
        self._latest_issue_url = url
        return url

    def file_name(self, c) -> str:
        """Return the file name from the Content-Disposition header of ``c``.

        Parses ``attachment; filename="..."`` and percent-decodes it.
        """
        return unquote(
            c.headers["Content-Disposition"]
            .split(";")[1]
            .split("=")[1]
            .strip('"')
        )
Loading

0 comments on commit 08d227b

Please sign in to comment.