Skip to content

Commit

Permalink
Merge pull request #24 from sean1832/0.3
Browse files Browse the repository at this point in the history
Fix critical bugs in search command and improve query parameter handling
  • Loading branch information
sean1832 authored Nov 23, 2024
2 parents 4053beb + 9253e6f commit 3148e19
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 26 deletions.
27 changes: 24 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Login to Pinterest using your credentials to obtain browser cookies for scraping
pinterest-dl login [options]
```

![login](/doc/images/pinterest-dl-0.2.0-login.gif)
![login](/doc/images/pinterest-dl-login.gif)

**Options:**
- `-o`, `--output [file]`: File to save browser cookies for future use. (default: `cookies.json`)
Expand All @@ -123,7 +123,7 @@ Extract images from a specified Pinterest URL.
pinterest-dl scrape [url] [output_dir] [options]
```

![scrape](/doc/images/pinterest-dl-0.2.0-scrape.gif)
![scrape](/doc/images/pinterest-dl-scrape.gif)

**Options:**

Expand All @@ -138,14 +138,35 @@ pinterest-dl scrape [url] [output_dir] [options]
- `--incognito`: Activate incognito mode for scraping. (*chrome / firefox only*)
- `--headful`: Run in headful mode with browser window. (*chrome / firefox only*)

#### 3. Download
#### 3. Search
Search for images on Pinterest using a query. (*Experimental, currently only available in API mode*)

**Syntax:**
```bash
pinterest-dl search [query] [output_dir] [options]
```

![search](/doc/images/pinterest-dl-search.gif)

**Options:**
- `-c`, `--cookies [file]`: File containing browser cookies for private boards/pins. Run `login` command to obtain cookies.
- `-l`, `--limit [number]`: Maximum number of images to download (default: 100).
- `-r`, `--resolution [width]x[height]`: Minimum image resolution for download (e.g., 512x512).
- `--timeout [second]`: Timeout in seconds for requests (default: 3).
- `--json`: Save scraped URLs to a JSON file.
- `--dry-run`: Execute scrape without downloading images.
- `--verbose`: Enable detailed output for debugging.

#### 4. Download
Download images from a list of URLs provided in a file.

**Syntax:**
```bash
pinterest-dl download [url_list] [options]
```

![download](/doc/images/pinterest-dl-download.gif)

**Options:**
- `-o`, `--output [directory]`: Output directory (default: ./<json_filename>).
- `-r`, `--resolution [width]x[height]`: Minimum image resolution for download (e.g., 512x512).
Expand Down
Binary file added doc/images/pinterest-dl-download.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
File renamed without changes
Binary file added doc/images/pinterest-dl-search.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion pinterest_dl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.3.0"
__version__ = "0.3.1"
__description__ = "An unofficial Pinterest image downloader"

from typing import Literal
Expand Down
2 changes: 1 addition & 1 deletion pinterest_dl/data_model/pinterest_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def prune_local(self, resolution: Tuple[int, int], verbose: bool = False) -> boo
if verbose:
print(f"Local path or size not set for {self.src}")
return False
if self.local_size and self.local_size < resolution:
if self.local_size is not None and resolution is not None and self.local_size < resolution:
self.local_path.unlink()
if verbose:
print(f"Removed {self.local_path}, resolution: {self.local_size} < {resolution}")
Expand Down
14 changes: 7 additions & 7 deletions pinterest_dl/low_level/api/pinterest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ def __init__(
self._session = requests.Session()
self._session.cookies.update(self.cookies) # Update session cookies
self._session.headers.update({"User-Agent": self.USER_AGENT})
self._req_builder = RequestBuilder()
self.is_pin = bool(self.pin_id)

def get_related_images(self, num: int, bookmark: List[str]) -> PinResponse:
Expand All @@ -75,7 +74,7 @@ def get_related_images(self, num: int, bookmark: List[str]) -> PinResponse:
"is_pdp": False,
}
try:
request_url = self._req_builder.build_get(endpoint, options, source_url)
request_url = RequestBuilder.build_get(endpoint, options, source_url)
response_raw = self._session.get(request_url, timeout=self.timeout)
except requests.exceptions.RequestException as e:
raise requests.RequestException(f"Failed to request related images: {e}")
Expand Down Expand Up @@ -103,7 +102,7 @@ def get_main_image(self) -> PinResponse:
}

try:
request_url = self._req_builder.build_get(endpoint, options, source_url)
request_url = RequestBuilder.build_get(endpoint, options, source_url)
response_raw = self._session.get(request_url, timeout=self.timeout)
except requests.exceptions.RequestException as e:
raise requests.RequestException(f"Failed to request main image: {e}")
Expand All @@ -126,7 +125,7 @@ def get_board(self) -> PinResponse:
}

try:
request_url = self._req_builder.build_get(endpoint, options, source_url)
request_url = RequestBuilder.build_get(endpoint, options, source_url)
response_raw = self._session.get(request_url, timeout=self.timeout)
except requests.exceptions.RequestException as e:
raise requests.RequestException(f"Failed to request board: {e}")
Expand Down Expand Up @@ -154,7 +153,7 @@ def get_board_feed(self, board_id: str, num: int, bookmark: List[str]) -> PinRes
}

try:
request_url = self._req_builder.build_get(endpoint, options, source_url)
request_url = RequestBuilder.build_get(endpoint, options, source_url)
response_raw = self._session.get(request_url, timeout=self.timeout)
except requests.exceptions.RequestException as e:
raise requests.RequestException(f"Failed to request board feed: {e}")
Expand Down Expand Up @@ -182,7 +181,7 @@ def get_search(self, num: int, bookmark: List[str]) -> PinResponse:
}

try:
request_url = self._req_builder.build_get(endpoint, options, source_url)
request_url = RequestBuilder.build_get(endpoint, options, source_url)
response_raw = self._session.get(request_url, timeout=self.timeout)
except requests.exceptions.RequestException as e:
raise requests.RequestException(f"Failed to request search: {e}")
Expand Down Expand Up @@ -216,7 +215,8 @@ def _parse_search_query(url: str) -> str:
result = re.search(r"/search/pins/\?q=([A-Za-z0-9%]+)&rs=typed", url)
if not result:
raise ValueError(f"Invalid Pinterest search URL: {url}")
return result.group(1)
query = result.group(1)
return RequestBuilder.url_decode(query)

@staticmethod
def _parse_board_url(url: str) -> Tuple[str, str]:
Expand Down
2 changes: 1 addition & 1 deletion pinterest_dl/low_level/ops/bookmark_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def clear(self) -> None:
def get(self) -> List[str]:
if len(self.bookmarks) < self.last:
return self.bookmarks
return self.bookmarks[self.last :]
return self.bookmarks[-self.last :]

def get_all(self) -> List[str]:
return self.bookmarks
18 changes: 10 additions & 8 deletions pinterest_dl/low_level/ops/request_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ class RequestBuilder:
def __init__(self) -> None:
pass

def build_post(self, options, source_url="/", context=None) -> str:
return self.url_encode(
@staticmethod
def build_post(options, source_url="/", context=None) -> str:
    """Assemble the url-encoded body for a Pinterest POST request.

    Wraps *options*/*context* in the JSON envelope Pinterest expects and
    appends a millisecond timestamp as a cache-busting parameter.
    """
    payload = {
        "source_url": source_url,
        "data": json.dumps({"options": options, "context": context}),
        "_": str(int(time.time() * 1000)),  # cache-busting timestamp (ms)
    }
    return RequestBuilder.url_encode(payload)

def build_get(
self, endpoint: str, options: dict, source_url: str = "/", context: dict = {}
) -> str:
query = self.url_encode(
@staticmethod
def build_get(endpoint: str, options: dict, source_url: str = "/", context: dict = {}) -> str:
query = RequestBuilder.url_encode(
{
"source_url": source_url,
"data": json.dumps({"options": options, "context": context}),
Expand All @@ -30,14 +30,16 @@ def build_get(
url = f"{endpoint}?{query}"
return url

def url_encode(self, query: str | dict) -> str:
@staticmethod
def url_encode(query: str | dict) -> str:
if isinstance(query, str):
query = quote_plus(query)
else:
query = urlencode(query)
query = query.replace("+", "%20")
return query

def url_decode(self, query: str) -> str:
@staticmethod
def url_decode(query: str) -> str:
# Decode the URL-encoded string
return unquote_plus(query)
42 changes: 37 additions & 5 deletions pinterest_dl/scrapers/scraper_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pinterest_dl.low_level.api.pinterest_api import PinterestAPI
from pinterest_dl.low_level.ops import io
from pinterest_dl.low_level.ops.bookmark_manager import BookmarkManager
from pinterest_dl.low_level.ops.request_builder import RequestBuilder

from .scraper_base import _ScraperBase

Expand Down Expand Up @@ -160,7 +161,7 @@ def search(
print(f"[Batch {batch_count}] bookmarks: {bookmarks.get()}")

time.sleep(delay)
remains = self._handle_missing_images(
remains = self._handle_missing_search_images(
api, batch_size, remains, bookmarks, min_resolution, images, pbar, delay
)
batch_count += 1
Expand Down Expand Up @@ -191,10 +192,15 @@ def search_and_download(
Returns:
Optional[List[PinterestImage]]: List of downloaded PinterestImage objects.
"""
if " " in query:
query = RequestBuilder.url_encode(query)
url = f"https://www.pinterest.com/search/pins/?q={query}&rs=typed"

if self.verbose:
print(f"Scraping URL: {url}")

api = PinterestAPI(url, self.cookies, timeout=self.timeout)
bookmarks = BookmarkManager(2)
bookmarks = BookmarkManager(1)

scraped_imgs = self.search(query, api, limit, min_resolution, 0.2, bookmarks)

Expand Down Expand Up @@ -245,7 +251,7 @@ def _scrape_pins(
if self.verbose:
print(f"bookmarks: {bookmarks.get()}")
time.sleep(delay)
remains = self._handle_missing_images(
remains = self._handle_missing_related_images(
api, batch_size, remains, bookmarks, min_resolution, images, pbar, delay
)

Expand Down Expand Up @@ -285,7 +291,7 @@ def _scrape_board(
break

time.sleep(delay)
remains = self._handle_missing_images(
remains = self._handle_missing_related_images(
api,
batch_size,
remains,
Expand Down Expand Up @@ -339,7 +345,33 @@ def _search_images(
bookmarks.add_all(response.get_bookmarks())
return current_img_batch, bookmarks

def _handle_missing_images(
def _handle_missing_search_images(
self,
api: PinterestAPI,
batch_size: int,
remains: int,
bookmarks: BookmarkManager,
min_resolution: Tuple[int, int],
images: List[PinterestImage],
pbar,
delay: float,
) -> int:
"""Handle cases where a batch does not return enough images."""
difference = batch_size - len(images[-batch_size:])
while difference > 0 and remains > 0:
next_response = api.get_search(difference, bookmarks.get())
next_response_data = next_response.resource_response.get("data", {}).get("results", [])
additional_images = PinterestImage.from_response(next_response_data, min_resolution)
images.extend(additional_images)
bookmarks.add_all(next_response.get_bookmarks())
remains -= len(additional_images)
difference -= len(additional_images)
pbar.update(len(additional_images))
time.sleep(delay)

return remains

def _handle_missing_related_images(
self,
api: PinterestAPI,
batch_size: int,
Expand Down
6 changes: 6 additions & 0 deletions pinterest_dl/scrapers/scraper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ def add_captions(
for index in tqdm.tqdm(indices_list, desc="Captioning", disable=verbose):
try:
img = images[index]
if not img.local_path:
continue
if img.local_path.suffix == ".gif":
if verbose:
print(f"Skipping captioning for {img.local_path} (GIF)")
continue
if img.origin:
img.write_comment(img.origin)
if verbose:
Expand Down

0 comments on commit 3148e19

Please sign in to comment.