Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xoolive committed Aug 1, 2021
0 parents commit 08d227b
Show file tree
Hide file tree
Showing 12 changed files with 733 additions and 0 deletions.
16 changes: 16 additions & 0 deletions kiosque.code-workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"folders": [{ "path": "." }],
"settings": {
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.linting.mypyEnabled": true,
"python.linting.enabled": true,

"editor.formatOnSave": true,
"python.formatting.provider": "black",

"python.testing.unittestEnabled": false,
"python.testing.nosetestsEnabled": false,
"python.testing.pytestEnabled": true
}
}
49 changes: 49 additions & 0 deletions kiosque/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# from .aviationweek import AviationWeek
import argparse

from .config import config_dict
from .courrierinternational import CourrierInternational
from .lemonde import LeMonde
from .mondediplomatique import MondeDiplomatique
from .pourlascience import PourLaScience


def main():
    """Command-line entry point.

    Reads credentials and aliases from the user configuration, then either
    downloads a single article (second positional argument), prints the URL
    of the latest issue (``-l``), or downloads and saves the latest issue.
    """

    newspapers = {
        "courrierinternational": CourrierInternational,
        "mondediplomatique": MondeDiplomatique,
        "pourlascience": PourLaScience,
        "lemonde": LeMonde,
    }

    # Register configured aliases and attach credentials to each class.
    for key, value in config_dict.items():
        # The config template may contain sections (e.g. [aviationweek])
        # with no registered class yet; skip them instead of crashing.
        if key not in newspapers:
            continue
        if "alias" in value:
            for alias in value["alias"].split(","):
                newspapers[alias.strip()] = newspapers[key]
            # Aliases are routing information, not credentials.
            del value["alias"]
        newspapers[key].credentials = value

    parser = argparse.ArgumentParser(description="news command-line interface")

    parser.add_argument(
        "args",
        nargs=argparse.REMAINDER,
        help="all arguments to dispatch to command",
    )

    parser.add_argument(
        "-l",
        dest="list",
        action="store_true",
        help="print the name of the latest edition without downloading",
    )

    args = parser.parse_args()

    if not args.args:
        parser.error("a newspaper name is required")
    name = args.args[0]
    if name not in newspapers:
        parser.error(f"unknown newspaper: {name}")

    getter = newspapers[name]()
    getter.login()
    if len(args.args) > 1:
        # An explicit URL was passed: fetch that single article.
        getter.get_content(args.args[1])
    elif args.list:
        # -l: show what the latest edition is, without downloading it.
        print(getter.latest_issue_url())
    else:
        getter.save_latest_issue()
40 changes: 40 additions & 0 deletions kiosque/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import configparser
from pathlib import Path
from typing import Dict

from appdirs import user_config_dir

config_dir = Path(user_config_dir("kiosque"))
config_file = config_dir / "kiosque.conf"

# First run: create the configuration directory and write a template file
# for the user to fill in with aliases and credentials.
if not config_dir.exists():
    config_template = """
[aviationweek]
alias = awst
[courrierinternational]
alias = courrier
name =
pass =
[mondediplomatique]
alias = diplomatique, diplo, lmd
email =
mot_de_passe =
[pourlascience]
alias = pls
email =
password =
"""
    config_dir.mkdir(parents=True)
    config_file.write_text(config_template)

config = configparser.RawConfigParser()
config.read(config_file.as_posix())

# Plain-dict view of the configuration: one entry per newspaper section,
# mapping option names to their raw string values.
config_dict: Dict[str, Dict[str, str]] = {
    section: dict(options.items())
    for section, options in config.items()
    if section != "DEFAULT"
}
42 changes: 42 additions & 0 deletions kiosque/courrierinternational.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from functools import lru_cache

from bs4 import BeautifulSoup

from .download import Download


class CourrierInternational(Download):
    """Download client for Courrier International.

    Logs in through the Drupal login form and locates the PDF of the
    latest weekly issue from the /magazine page.
    """

    base_url = "https://www.courrierinternational.com"
    login_url = base_url + "/login?destination=node/6"

    def login_dict(self):
        """Build the POST payload for the login form.

        Fetches the login page first to extract the server-generated
        ``form_build_id`` token required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="form_build_id")
        form_id = e.find("input", attrs=attrs).attrs["value"]

        return {
            "remember_me": "1",
            "form_build_id": form_id,
            "form_id": "user_login",
            "op": "Se connecter",
            **self.credentials,
        }

    def latest_issue_url(self):
        """Return the download URL of the latest issue (cached per instance).

        NOTE: previously decorated with ``lru_cache()``, which on an instance
        method keys on ``self`` and keeps the instance alive for the cache's
        lifetime; a per-instance attribute avoids that leak.
        """
        cached = getattr(self, "_latest_issue_url", None)
        if cached is not None:
            return cached

        c = self.session.get(self.base_url + "/magazine")
        c.raise_for_status()
        e = BeautifulSoup(c.content, features="lxml")

        # The first "item hebdo" article on the magazine page is the
        # latest weekly issue.
        x = e.find("article", attrs={"class": "item hebdo"})

        c = self.session.get(self.base_url + x.find("a").attrs["href"])
        c.raise_for_status()
        e = BeautifulSoup(c.content, features="lxml")
        attrs = {"class": "issue-download"}

        url = e.find("a", attrs=attrs).attrs["href"]
        self._latest_issue_url = url
        return url
58 changes: 58 additions & 0 deletions kiosque/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from pathlib import Path
from typing import Dict

import requests


class Download:
    """Base class for newspaper download sessions.

    Subclasses must set ``login_url`` and implement ``latest_issue_url``;
    they may override ``login_dict`` (form payload), ``file_name`` and
    ``get_content``. ``credentials`` is attached by the CLI from the
    user configuration.
    """

    login_url: str
    credentials: Dict[str, str]

    def __init__(self) -> None:
        self.connected = False
        # Cached Response of the latest issue, filled by get_latest_issue().
        self.latest_issue = None
        self.session: requests.Session = requests.Session()
        # Some sites reject requests with the default requests User-Agent.
        self.session.headers.update(
            {
                "User-Agent": (
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) "
                    "Gecko/20100101 Firefox/72.0"
                )
            }
        )

    def login_dict(self) -> Dict[str, str]:
        """Return the form payload to POST to ``login_url`` (default: empty)."""
        return {}

    def login(self) -> None:
        """Authenticate the session by POSTing the login form."""
        c = self.session.post(self.login_url, data=self.login_dict())
        c.raise_for_status()
        self.connected = True

    def latest_issue_url(self) -> str:
        """Return the URL of the latest issue; implemented by subclasses."""
        raise NotImplementedError()

    def file_name(self, c) -> str:
        """Return the output file name for a downloaded issue.

        The default derives it from the last path component of the issue
        URL; ``c`` (the Response) is unused here but available to
        subclasses that read e.g. Content-Disposition.
        """
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        return Path(url).name

    def get_latest_issue(self):
        """Download the latest issue once and return the Response.

        The result is cached on the instance; the original version checked
        ``self.latest_issue`` but never assigned it, so every call
        re-downloaded the issue.
        """
        if self.latest_issue is not None:
            return self.latest_issue
        if not self.connected:
            self.login()
        url = self.latest_issue_url()
        c = self.session.get(url)
        # Fail loudly on HTTP errors, as every other request in this
        # package does.
        c.raise_for_status()
        self.latest_issue = c
        return c

    def save_latest_issue(self):
        """Download the latest issue and write it to the current directory."""
        c = self.get_latest_issue()
        full_path = Path(".") / self.file_name(c)
        full_path.write_bytes(c.content)
        print(f"File written: {full_path}")

    def get_content(self, url):
        """Fetch a single article at ``url``; no-op unless overridden."""
        return
91 changes: 91 additions & 0 deletions kiosque/lemonde.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import re
from pathlib import Path

import pypandoc
from bs4 import BeautifulSoup

from .download import Download


class LeMonde(Download):
    """Download client for Le Monde: logs in and saves articles as Markdown."""

    base_url = "https://www.lemonde.fr/"
    login_url = "https://secure.lemonde.fr/sfuser/connexion"

    def login_dict(self):
        """Build the POST payload for the login form.

        Fetches the login page first to extract the CSRF token
        (``connection[_token]``) required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="connection[_token]")
        token = e.find("input", attrs=attrs).attrs["value"]

        return {
            "connection[mail]": self.credentials["mail"],
            "connection[password]": self.credentials["password"],
            "connection[stay_connected]": 1,
            "connection[save]": "",
            "connection[newsletters]": [],
            "connection[_token]": token,
        }

    def get_content(self, url):
        """Download the article at ``url`` and save it as a Markdown file.

        The output file is named ``YYYY-MM-DD-<slug>.md`` (date taken from
        the article URL) and starts with a YAML front-matter header.
        """
        c = self.session.get(url)
        # Consistent with every other request in the package.
        c.raise_for_status()

        m = re.search(r"\d{4}/\d{2}/\d{2}", url)
        if m is None:
            # Explicit error instead of assert (asserts vanish under -O).
            raise ValueError(f"No date found in URL: {url}")
        date = m.group().replace("/", "-")

        filename = f"{date}-{url.split('/')[-1].replace('.html', '.md')}"

        e = BeautifulSoup(c.content, features="lxml")

        title = e.find("h1")
        author = e.find("a", attrs={"class": "article__author"})
        if author is None:
            author = e.find("a", attrs={"class": "article__author-link"})
        desc = e.find("p", attrs={"class": "article__desc"})

        def text_of(tag) -> str:
            # Some articles omit the author or description; fall back to ""
            # instead of crashing on tag.text with tag is None.
            return tag.text.strip() if tag is not None else ""

        header = f"""---
title: {text_of(title)}
author: {text_of(author)}
date: {date}
header: {text_of(desc)}
---
"""

        # Regular articles use <article class="article__content">; "live"
        # pages embed the content in a <section> of the same class.
        article = e.find("article", attrs={"class": "article__content"})
        if article is None:
            article = e.find("section", attrs={"class": "article__content"})
        else:
            embedded = article.find(
                "section", attrs={"class": "article__content"}
            )
            if embedded is not None:
                article = embedded

        # Strip boilerplate: teasers, author boxes, reactions and ads.
        for x in article.find_all("section", attrs={"class": "catcher"}):
            x.decompose()
        for x in article.find_all("section", attrs={"class": "author"}):
            x.decompose()
        for x in article.find_all(
            "section", attrs={"class": "article__reactions"}
        ):
            x.decompose()
        for x in article.find_all("div", attrs={"class": "dfp__inread"}):
            x.decompose()

        # Normalize markup before the pandoc conversion: plain h2 titles,
        # h3 pull-quotes become blockquotes, figures (images) are dropped.
        for x in article.find_all("h2"):
            x.attrs.clear()
        for x in article.find_all("h3"):
            x.name = "blockquote"
            x.attrs.clear()
        for x in article.find_all("figure"):
            x.decompose()

        output = pypandoc.convert_text(article, "md", format="html")

        print(filename)
        Path(filename).write_text(f"{header}\n\n{output}")
56 changes: 56 additions & 0 deletions kiosque/mondediplomatique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from functools import lru_cache
from urllib.parse import unquote

from bs4 import BeautifulSoup

from .download import Download


class MondeDiplomatique(Download):
    """Download client for Le Monde diplomatique.

    Logs in through the SSO form on lecteurs.mondediplo.net and locates
    the PDF of the current issue from the front page.
    """

    base_url = "https://www.monde-diplomatique.fr/"
    login_url = "https://lecteurs.mondediplo.net/?page=connexion_sso"

    def login_dict(self):
        """Build the POST payload for the SPIP SSO login form.

        Fetches the login page first to extract the server-generated
        ``formulaire_action_args`` token required by the form.
        """
        c = self.session.get(self.login_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")
        attrs = dict(name="formulaire_action_args")
        form_id = e.find("input", attrs=attrs).attrs["value"]

        return {
            "formulaire_action": "identification_sso",
            "formulaire_action_args": form_id,
            "retour": "https://www.monde-diplomatique.fr/",
            "site_distant": "https://www.monde-diplomatique.fr/",
            "valider": "Valider",
            **self.credentials,
        }

    def latest_issue_url(self):
        """Return the PDF download URL of the current issue (cached per instance).

        NOTE: previously decorated with ``lru_cache()``, which on an instance
        method keys on ``self`` and keeps the instance alive for the cache's
        lifetime; a per-instance attribute avoids that leak.
        """
        cached = getattr(self, "_latest_issue_url", None)
        if cached is not None:
            return cached

        c = self.session.get(self.base_url)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")

        # Front page links to the current issue via #entree-numero.
        current = e.find("a", attrs={"id": "entree-numero"}).attrs["href"]

        c = self.session.get(self.base_url + current)
        c.raise_for_status()

        e = BeautifulSoup(c.content, features="lxml")

        attrs = {"class": "format PDF"}
        url = e.find("div", attrs=attrs).find("a").attrs["href"]
        url = self.base_url + url
        self._latest_issue_url = url
        return url

    def file_name(self, c) -> str:
        """Return the file name from the Content-Disposition header of ``c``.

        Parses ``attachment; filename="..."`` and percent-decodes it.
        """
        return unquote(
            c.headers["Content-Disposition"]
            .split(";")[1]
            .split("=")[1]
            .strip('"')
        )
Loading

0 comments on commit 08d227b

Please sign in to comment.