fix: Linting issues
iusztinpaul committed Jun 17, 2024
1 parent 18b5155 commit 21e4d54
Showing 40 changed files with 178 additions and 307 deletions.
8 changes: 2 additions & 6 deletions llm_engineering/application/crawlers/base.py
@@ -57,14 +57,10 @@ def scroll_page(self) -> None:
         current_scroll = 0
         last_height = self.driver.execute_script("return document.body.scrollHeight")
         while True:
-            self.driver.execute_script(
-                "window.scrollTo(0, document.body.scrollHeight);"
-            )
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
             time.sleep(5)
             new_height = self.driver.execute_script("return document.body.scrollHeight")
-            if new_height == last_height or (
-                self.scroll_limit and current_scroll >= self.scroll_limit
-            ):
+            if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
                 break
             last_height = new_height
             current_scroll += 1
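
For context, the reflowed lines implement the usual "scroll until the page stops growing" loop. A standalone sketch of the same pattern, assuming a Selenium driver is available (the helper name is hypothetical; the logic mirrors the diff):

import time

from selenium import webdriver


def scroll_to_bottom(driver: webdriver.Chrome, scroll_limit: int | None = None) -> None:
    # Scroll until the document height stops changing or the limit is reached.
    current_scroll = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # give lazily loaded content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height or (scroll_limit and current_scroll >= scroll_limit):
            break
        last_height = new_height
        current_scroll += 1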
8 changes: 4 additions & 4 deletions llm_engineering/application/crawlers/dispatcher.py
@@ -13,22 +13,22 @@ def __init__(self) -> None:
     @classmethod
     def build(cls) -> "CrawlerDispatcher":
         dispatcher = cls()

         return dispatcher

     def register_medium(self) -> "CrawlerDispatcher":
         self.register("medium", MediumCrawler)

         return self

     def register_linkedin(self) -> "CrawlerDispatcher":
         self.register("linkedin", LinkedInCrawler)

         return self

     def register_github(self) -> "CrawlerDispatcher":
         self.register("github", GithubCrawler)

         return self

     def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
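
Each register_* method returns self and build() returns a fresh instance, so registrations chain fluently. A hypothetical call site based on the methods above:

from llm_engineering.application.crawlers.dispatcher import CrawlerDispatcher

# Chaining works because every register_* method returns self.
dispatcher = (
    CrawlerDispatcher.build()
    .register_medium()
    .register_linkedin()
    .register_github()
)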
9 changes: 4 additions & 5 deletions llm_engineering/application/crawlers/github.py
@@ -8,7 +8,6 @@
 from llm_engineering.domain.documents import RepositoryDocument

 from .base import BaseCrawler
-from .base import BaseCrawler


 class GithubCrawler(BaseCrawler):
@@ -29,19 +28,19 @@ def extract(self, link: str, **kwargs) -> None:
         os.chdir(local_temp)
         subprocess.run(["git", "clone", link])

-        repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
+        repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

         tree = {}
-        for root, dirs, files in os.walk(repo_path):
+        for root, _, files in os.walk(repo_path):
             dir = root.replace(repo_path, "").lstrip("/")
             if dir.startswith(self._ignore):
                 continue

             for file in files:
                 if file.endswith(self._ignore):
                     continue
-                file_path = os.path.join(dir, file)
-                with open(os.path.join(root, file), "r", errors="ignore") as f:
+                file_path = os.path.join(dir, file)  # noqa: PTH118
+                with open(os.path.join(root, file), "r", errors="ignore") as f:  # noqa: PTH123, PTH118
                     tree[file_path] = f.read().replace(" ", "")

         instance = self.model(
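
The added # noqa codes opt out of ruff's flake8-use-pathlib rules (PTH118 flags os.path.join, PTH123 the open() builtin) instead of rewriting the calls. For reference, a rough pathlib equivalent of the suppressed lines (a sketch, not what the commit does):

from pathlib import Path


def first_cloned_repo(local_temp: str) -> Path:
    # pathlib version of os.path.join(local_temp, os.listdir(local_temp)[0])
    return next(Path(local_temp).iterdir())


def read_file(root: str, file: str) -> str:
    # pathlib version of open(os.path.join(root, file), "r", errors="ignore")
    return (Path(root) / file).read_text(errors="ignore")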
10 changes: 5 additions & 5 deletions llm_engineering/application/crawlers/linkedin.py
@@ -6,10 +6,11 @@
 from loguru import logger
 from selenium.webdriver.common.by import By

-from .base import BaseAbstractCrawler
 from llm_engineering.domain.documents import PostDocument
-from llm_engineering.settings import settings
 from llm_engineering.domain.exceptions import ImproperlyConfigured
+from llm_engineering.settings import settings
+
+from .base import BaseAbstractCrawler


 class LinkedInCrawler(BaseAbstractCrawler):
@@ -25,7 +26,7 @@ def extract(self, link: str, **kwargs):

         soup = self._get_page_content(link)

-        data = {
+        data = {  # noqa: F841
             "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
             "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
             "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
@@ -36,8 +37,7 @@ def extract(self, link: str, **kwargs):
         self.driver.get(link)
         time.sleep(5)
         button = self.driver.find_element(
-            By.CSS_SELECTOR,
-            ".app-aware-link.profile-creator-shared-content-view__footer-action"
+            By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action"
         )
         button.click()
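
The reflowed find_element call now fits on one line under the relaxed line-length limit. As a standalone illustration of the same click-through step (the driver setup and URL are assumptions; the selector comes from the diff):

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumed setup; the crawler manages its own driver
driver.get("https://www.linkedin.com/in/example-profile")  # hypothetical URL
time.sleep(5)  # let the page render before querying the DOM
button = driver.find_element(By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action")
button.click()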
4 changes: 2 additions & 2 deletions llm_engineering/application/crawlers/medium.py
@@ -30,15 +30,15 @@ def extract(self, link: str, **kwargs) -> None:
         }

         self.driver.close()

         instance = self.model(
             platform="medium",
             content=data,
             link=link,
             author_id=kwargs["user"].id,
         )
         instance.save()

         logger.info(f"Successfully scraped and saved article: {link}")

     def login(self):
1 change: 0 additions & 1 deletion llm_engineering/application/dataset/__init__.py
@@ -1,4 +1,3 @@
 from . import generation
-

 __all__ = ["generation"]
35 changes: 11 additions & 24 deletions llm_engineering/application/dataset/generation.py
@@ -44,16 +44,12 @@ def get_system_prompt(cls) -> Prompt:
         )

     @classmethod
-    def get_prompts(
-        cls, documents: list[CleanedDocument]
-    ) -> dict[DataCategory, list[GenerateDatasetSamplesPrompt]]:
+    def get_prompts(cls, documents: list[CleanedDocument]) -> dict[DataCategory, list[GenerateDatasetSamplesPrompt]]:
         grouped_prompts = {}
         grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
         for category, documents in grouped_cleaned_documents.items():
             batched_documents_generator = cls._batch_by_category(category, documents)
-            category_prompts = [
-                cls.get_prompt(batch) for batch in batched_documents_generator
-            ]
+            category_prompts = [cls.get_prompt(batch) for batch in batched_documents_generator]
             grouped_prompts[category] = category_prompts

         return grouped_prompts
@@ -73,9 +69,7 @@ def _batch_by_category(
             raise ValueError(f"Unsupported category: {category}")

     @classmethod
-    def get_prompt(
-        cls, documents: list[CleanedDocument]
-    ) -> GenerateDatasetSamplesPrompt:
+    def get_prompt(cls, documents: list[CleanedDocument]) -> GenerateDatasetSamplesPrompt:
         assert len(documents) > 0, "At least one document is required"

         data_category = documents[0].get_category()
@@ -90,9 +84,7 @@ def get_prompt(
         input_variables = {
             "data_category": data_category,
             "len_documents": len(documents),
-            "documents": [
-                {"index": i, "content": doc.content} for i, doc in enumerate(documents)
-            ],
+            "documents": [{"index": i, "content": doc.content} for i, doc in enumerate(documents)],
         }
         prompt = prompt_template.format(**input_variables)
         prompt_tokens = cls.tokenizer.encode(prompt)
@@ -126,42 +118,37 @@ def _batch_to_langchain_prompt(
         ]

         return messages

     if mock:
         llm = FakeListLLM(
             responses=[
-                '```json\n[{"instruction": "mock instruction"}, {"instruction": "mock instruction"}, {"instruction": "mock instruction"}]\n```'  # noqa
+                '```json\n[{"instruction": "mock instruction"}, {"instruction": "mock instruction"}, {"instruction": "mock instruction"}]\n```'
             ]
         )
     else:
         llm = ChatOpenAI(model=settings.OPENAI_MODEL_ID, temperature=0)
-    parser = ListPydanticOutputParser(
-        pydantic_object=domain.dataset.InstructDatasetSample
-    )
+    parser = ListPydanticOutputParser(pydantic_object=domain.dataset.InstructDatasetSample)

     chain = llm | parser

     datasets = {}
     for category, category_prompts in prompts.items():
-        langchain_category_prompts = [
-            _batch_to_langchain_prompt(batch) for batch in category_prompts
-        ]
+        langchain_category_prompts = [_batch_to_langchain_prompt(batch) for batch in category_prompts]
         batched_instruct_dataset_samples = chain.batch(langchain_category_prompts)

         flattened_instruct_dataset_samples = []
         for prompt, per_prompt_instruct_dataset_samples in zip(
-            category_prompts, batched_instruct_dataset_samples
+            category_prompts, batched_instruct_dataset_samples, strict=False
         ):
             prompt_documents_as_response = prompt.documents
             for document_as_response, instruct_dataset_sample in zip(
-                prompt_documents_as_response, per_prompt_instruct_dataset_samples
+                prompt_documents_as_response, per_prompt_instruct_dataset_samples, strict=False
             ):
                 instruct_dataset_sample.response = document_as_response.content

                 flattened_instruct_dataset_samples.append(instruct_dataset_sample)

-        dataset = domain.dataset.InstructDataset(
-            category=category, samples=flattened_instruct_dataset_samples
-        )
+        dataset = domain.dataset.InstructDataset(category=category, samples=flattened_instruct_dataset_samples)
         datasets[category] = dataset

     return datasets
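
The strict=False arguments satisfy ruff's B905 rule, which requires zip calls to state explicitly whether silent truncation of unequal-length inputs is intended. A small illustration of the difference:

prompts = ["p1", "p2", "p3"]
batches = [["a"], ["b"]]

# strict=False keeps zip's historical behaviour: iteration stops at the
# shorter input, so "p3" is silently dropped.
for prompt, batch in zip(prompts, batches, strict=False):
    print(prompt, batch)

# zip(prompts, batches, strict=True) would raise ValueError here instead.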
1 change: 1 addition & 0 deletions llm_engineering/application/dataset/output_parsers.py
@@ -1,5 +1,6 @@
 from langchain.output_parsers import PydanticOutputParser

+
 class ListPydanticOutputParser(PydanticOutputParser):
     def _parse_obj(self, obj: dict | list):
         if isinstance(obj, list):
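
generation.py composes this parser into a chain as llm | parser. A usage sketch mirroring that call site, with a hypothetical stand-in for the domain sample model:

from pydantic import BaseModel


class InstructDatasetSample(BaseModel):  # stand-in for domain.dataset.InstructDatasetSample
    instruction: str


# _parse_obj above lets the parser accept a JSON array and validate each
# element against the pydantic model, not just a single object.
parser = ListPydanticOutputParser(pydantic_object=InstructDatasetSample)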
2 changes: 1 addition & 1 deletion llm_engineering/application/networks/__init__.py
@@ -1,3 +1,3 @@
-from .embeddings import EmbeddingModelSingleton, CrossEncoderModelSingleton
+from .embeddings import CrossEncoderModelSingleton, EmbeddingModelSingleton

 __all__ = ["EmbeddingModelSingleton", "CrossEncoderModelSingleton"]
3 changes: 2 additions & 1 deletion llm_engineering/application/networks/base.py
@@ -1,12 +1,13 @@
 from threading import Lock
+from typing import ClassVar


 class SingletonMeta(type):
     """
     This is a thread-safe implementation of Singleton.
     """

-    _instances = {}
+    _instances: ClassVar = {}

     _lock: Lock = Lock()
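
The ClassVar annotation satisfies ruff's RUF012 check (mutable class attributes should be annotated with typing.ClassVar). A minimal thread-safe version of the pattern, with the __call__ body that the diff truncates filled in from the standard double-checked-locking recipe (an assumption, not necessarily this repo's exact code):

from threading import Lock
from typing import ClassVar


class SingletonMeta(type):
    """Thread-safe Singleton metaclass."""

    _instances: ClassVar[dict] = {}
    _lock: Lock = Lock()

    def __call__(cls, *args, **kwargs):
        # Only one thread may create the instance; later calls reuse it.
        with cls._lock:
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]


class Config(metaclass=SingletonMeta):  # hypothetical consumer
    pass


assert Config() is Config()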
24 changes: 9 additions & 15 deletions llm_engineering/application/networks/embeddings.py
@@ -5,8 +5,8 @@
 import numpy as np
 from loguru import logger
 from numpy.typing import NDArray
-from sentence_transformers.cross_encoder import CrossEncoder
 from sentence_transformers.SentenceTransformer import SentenceTransformer
+from sentence_transformers.cross_encoder import CrossEncoder
 from transformers import AutoTokenizer

 from llm_engineering.settings import settings
@@ -54,7 +54,7 @@ def embedding_size(self) -> int:
         Returns:
             int: The size of the embeddings generated by the pre-trained transformer model.
         """

         dummy_embedding = self._model.encode("")

         return dummy_embedding.shape[0]
@@ -81,9 +81,7 @@ def tokenizer(self) -> AutoTokenizer:

         return self._model.tokenizer

-    def __call__(
-        self, input_text: str, to_list: bool = True
-    ) -> NDArray[np.float32] | list[float]:
+    def __call__(self, input_text: str, to_list: bool = True) -> NDArray[np.float32] | list[float]:
         """
         Generates embeddings for the input text using the pre-trained transformer model.
@@ -98,9 +96,7 @@ def __call__(
         try:
             embeddings = self._model.encode(input_text)
         except Exception:
-            logger.error(
-                f"Error generating embeddings for {self._model_id=} and {input_text=}"
-            )
+            logger.error(f"Error generating embeddings for {self._model_id=} and {input_text=}")

             return [] if to_list else np.array([])
@@ -124,14 +120,12 @@ def __init__(
         self._device = device

         self._model = CrossEncoder(
             model_name=self._model_id,
             device=self._device,
         )
         self._model.model.eval()

-    def __call__(
-        self, pairs: list[tuple[str, str]], to_list: bool = True
-    ) -> NDArray[np.float32] | list[float]:
+    def __call__(self, pairs: list[tuple[str, str]], to_list: bool = True) -> NDArray[np.float32] | list[float]:
         scores = self._model.predict(pairs)

         if to_list:
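
Both reflowed __call__ signatures wrap sentence-transformers models. A standalone sketch of the reranking path (the model id is an assumption; the to_list conversion mirrors the diff):

from sentence_transformers.cross_encoder import CrossEncoder

model = CrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed model id
pairs = [("what is rag?", "RAG pairs a retriever with a generator.")]
scores = model.predict(pairs)
print(scores.tolist())  # the to_list=True branch converts exactly like this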
2 changes: 1 addition & 1 deletion llm_engineering/application/preprocessing/dispatchers.py
@@ -16,10 +16,10 @@
     RepositoryCleaningHandler,
 )
 from .embedding_data_handlers import (
-    QueryEmbeddingHandler,
     ArticleEmbeddingHandler,
     EmbeddingDataHandler,
     PostEmbeddingHandler,
+    QueryEmbeddingHandler,
     RepositoryEmbeddingHandler,
 )

llm_engineering/application/preprocessing/embedding_data_handlers.py
@@ -13,7 +13,7 @@
     EmbeddedPostChunk,
     EmbeddedRepositoryChunk,
 )
-from llm_engineering.domain.queries import Query, EmbeddedQuery
+from llm_engineering.domain.queries import EmbeddedQuery, Query

 from .operations import embedd_text
@@ -30,7 +30,7 @@ class EmbeddingDataHandler(ABC, Generic[ChunkT, EmbeddedChunkT]):
     @abstractmethod
     def embed(self, data_model: ChunkT) -> EmbeddedChunkT:
         pass


 class QueryEmbeddingHandler(EmbeddingDataHandler):
     def embed(self, data_model: Query) -> EmbeddedQuery:
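
EmbeddingDataHandler is generic over the chunk type it consumes and the embedded type it produces. A schematic of that contract (the class signature and abstract method come from the diff; the unbounded TypeVars are assumptions):

from abc import ABC, abstractmethod
from typing import Generic, TypeVar

ChunkT = TypeVar("ChunkT")
EmbeddedChunkT = TypeVar("EmbeddedChunkT")


class EmbeddingDataHandler(ABC, Generic[ChunkT, EmbeddedChunkT]):
    # Subclasses such as QueryEmbeddingHandler pick concrete types,
    # e.g. embed(self, data_model: Query) -> EmbeddedQuery.
    @abstractmethod
    def embed(self, data_model: ChunkT) -> EmbeddedChunkT: ...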
llm_engineering/application/preprocessing/operations/chunking.py
@@ -7,10 +7,9 @@

 embedding_model = EmbeddingModelSingleton()


 def chunk_text(text: str) -> list[str]:
-    character_splitter = RecursiveCharacterTextSplitter(
-        separators=["\n\n"], chunk_size=500, chunk_overlap=0
-    )
+    character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=500, chunk_overlap=0)
     text_split = character_splitter.split_text(text)

     token_splitter = SentenceTransformersTokenTextSplitter(
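
chunk_text first splits on paragraph boundaries with a character splitter, then (in the truncated part) re-splits with a sentence-transformers token splitter. A runnable sketch of the character stage as reformatted here (the import path and sample text are assumptions):

from langchain.text_splitter import RecursiveCharacterTextSplitter  # assumed import path

character_splitter = RecursiveCharacterTextSplitter(separators=["\n\n"], chunk_size=500, chunk_overlap=0)
sample = "\n\n".join(f"Paragraph {i} with some filler text." for i in range(40))
chunks = character_splitter.split_text(sample)
print(len(chunks), max(len(c) for c in chunks))  # chunks respect the 500-character budget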
26 changes: 12 additions & 14 deletions llm_engineering/application/preprocessing/operations/cleaning.py
@@ -9,17 +9,17 @@

 def unbold_text(text):
     # Mapping of bold numbers to their regular equivalents
-    bold_numbers = {
-        "𝟬": "0",
-        "𝟭": "1",
-        "𝟮": "2",
-        "𝟯": "3",
-        "𝟰": "4",
-        "𝟱": "5",
-        "𝟲": "6",
-        "𝟳": "7",
-        "𝟴": "8",
-        "𝟵": "9",
+    bold_numbers = {
+        "𝟬": "0",  # noqa: RUF001
+        "𝟭": "1",  # noqa: RUF001
+        "𝟮": "2",  # noqa: RUF001
+        "𝟯": "3",  # noqa: RUF001
+        "𝟰": "4",  # noqa: RUF001
+        "𝟱": "5",  # noqa: RUF001
+        "𝟲": "6",  # noqa: RUF001
+        "𝟳": "7",  # noqa: RUF001
+        "𝟴": "8",  # noqa: RUF001
+        "𝟵": "9",  # noqa: RUF001
     }

     # Function to convert bold characters (letters and numbers)
@@ -38,9 +38,7 @@ def convert_bold_char(match):
         return char  # Return the character unchanged if it's not a bold number or letter

     # Regex for bold characters (numbers, uppercase, and lowercase letters)
-    bold_pattern = re.compile(
-        r"[\U0001D5D4-\U0001D5ED\U0001D5EE-\U0001D607\U0001D7CE-\U0001D7FF]"
-    )
+    bold_pattern = re.compile(r"[\U0001D5D4-\U0001D5ED\U0001D5EE-\U0001D607\U0001D7CE-\U0001D7FF]")
     text = bold_pattern.sub(convert_bold_char, text)

     return text
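
The # noqa: RUF001 markers silence ruff's ambiguous-unicode warning for the mathematical bold digits without changing the mapping. The same translation can be expressed compactly from the code points (a distilled sketch, not the repo's implementation):

# "𝟬" is U+1D7EC (MATHEMATICAL SANS-SERIF BOLD DIGIT ZERO); the ten bold
# digits are contiguous, so the mapping can be generated instead of spelled out.
BOLD_DIGITS = {chr(0x1D7EC + i): str(i) for i in range(10)}


def unbold_digits(text: str) -> str:
    return "".join(BOLD_DIGITS.get(char, char) for char in text)


print(unbold_digits("𝟭𝟮𝟯 followers"))  # -> "123 followers"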