Skip to content

Commit 7262d2e

Browse files
committed
[topstarnews] Fix article extraction
1 parent 67c0d65 commit 7262d2e

File tree

2 files changed

+5
-25
lines changed

2 files changed

+5
-25
lines changed

extractor/topstarnews.py

+3 -21
Original file line number | Diff line number | Diff line change
@@ -62,28 +62,10 @@ def metadata(self, page):
62 | 62 |   '"',
63 | 63 |   )[0],
64 | 64 |   )
65 |    | - or text.parse_datetime(
66 |    | - text.extr(
67 |    | - page,
68 |    | - '<i class="fa fa-clock-o fa-fw"></i>',
69 |    | - "</li>",
70 |    | - )
71 |    | - .strip()
72 |    | - .split(" ", maxsplit=1)[1],
73 |    | - format="%Y.%m.%d %H:%M",
74 |    | - utcoffset=9,
75 |    | - )
76 |    | - ),
77 |    | - "author": text.extr(
78 |    | - page,
79 |    | - '<i class="fa fa-user-o fa-fw"></i>',
80 |    | - "</li>",
81 |    | - ).strip(),
82 |    | - "views": text.parse_int(
83 |    | - text.extr(page, '<i class="fa fa-desktop fa-fw"></i>', "</li>").strip().split(" ", maxsplit=1)[1],
84 | 65 |   ),
   | 66 | + "author": text.extr(page, ' name="author" content="', '"').strip().replace(" 기자", ""),
85 | 67 |   "post_id": self.post_id,
86 |    | - "post_url": self.url,
   | 68 | + "post_url": self.post_url,
87 | 69 |   }
88 | 70 |   if ' name="keywords" content="' in page:
89 | 71 |   data["tags"] = text.extr(page, ' name="keywords" content="', '"').split(",")
@@ -97,7 +79,7 @@ def items(self):
97 | 79 |
98 | 80 |   yield Message.Directory, data
99 | 81 |
100 |    | - article_body = text.extr(page, ' itemprop="articleBody">', '<div id="article-sns2"')
    | 82 | + article_body = text.extr(page, 'itemprop="articleBody"', "</article>")
101 | 83 |
102 | 84 |   images = [
103 | 85 |   text.extr(figure, "<img", ">")

test/results/topstarnews.py

+2 -4
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,7 @@
15 | 15 |   "date": "dt:2024-09-11 10:54:00",
16 | 16 |   "title": "레드벨벳 웬디, ‘리본 하트 해달랬더니 근육 몽몽이가 돼버린 와니’ (웬디의 영스트리트 출근길)",
17 | 17 |   "tags": ["웬디", "WENDY", "영스트리트", "출근", "퇴근", "프리뷰"],
18 |    | - "author": "최규석 기자",
19 |    | - "views": int,
   | 18 | + "author": "최규석",
20 | 19 |   "post_id": "15543685",
21 | 20 |   "post_url": "https://www.topstarnews.net/news/articleView.html?idxno=15543685",
22 | 21 |   },
@@ -29,8 +28,7 @@
29 | 28 |   "#count": 1,
30 | 29 |   "date": "dt:2012-04-24 06:45:00",
31 | 30 |   "title": "걸스데이(Girls Day) 혜리, '남자들은 다 똑같아!' 깜찍한 무대 …MBC MUSIC 쇼 챔피언 생방송 현장",
32 |    | - "author": "최규석 기자",
33 |    | - "views": int,
   | 31 | + "author": "최규석",
34 | 32 |   "post_id": "30789",
35 | 33 |   "post_url": "https://www.topstarnews.net/news/articleView.html?idxno=30789",
36 | 34 |   },

0 commit comments

Comments (0)