Add Unit Tests for Image Processing + Page Number Tracking
BenConstable9 authored Feb 19, 2025
1 parent 064d406 commit c17067a
Showing 22 changed files with 2,160 additions and 174 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/ci-checks.yaml
@@ -36,3 +36,29 @@ jobs:

- name: Run pre-commit
run: uv run pre-commit run --all-files

job-image-processing-unit-tests:
name: Image Processing Unit Tests
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: ${{ env.MIN_PYTHON_VERSION }}

- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true

- name: Install the project
run: uv sync
working-directory: image_processing

- name: Run PyTest
run: uv run pytest --cov=. --cov-config=.coveragerc
working-directory: image_processing
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -18,6 +18,7 @@ repos:

# Python checks
- id: name-tests-test
args: [--pytest-test-first]

# JSON files
- id: pretty-format-json
36 changes: 28 additions & 8 deletions deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
@@ -219,7 +219,11 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/page_wise_layout/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/page_wise_layout/*/merged_content"
name="mark_up", source="/document/page_wise_layout/*/merged_content"
),
InputFieldMappingEntry(
name="page_number",
source="/document/page_wise_layout/*/page_number",
),
InputFieldMappingEntry(
name="figures",
@@ -230,20 +234,26 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/chunk_mark_ups/*"
name="mark_up", source="/document/chunk_mark_ups/*/mark_up"
),
InputFieldMappingEntry(
name="page_number", source="/document/chunk_mark_ups/*/page_number"
),
InputFieldMappingEntry(
name="figures", source="/document/layout/figures/*/updated_figure"
),
]

mark_up_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
OutputFieldMappingEntry(
name="chunk_sections", target_name="chunk_sections"
name="cleaned_text", target_name="final_cleaned_text"
),
OutputFieldMappingEntry(name="sections", target_name="final_sections"),
OutputFieldMappingEntry(name="mark_up", target_name="final_mark_up"),
OutputFieldMappingEntry(name="figures", target_name="final_chunk_figures"),
OutputFieldMappingEntry(
name="page_number", target_name="final_page_number"
),
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
]

mark_up_cleaner_skill = WebApiSkill(
@@ -302,7 +312,11 @@ def get_semantic_chunker_skill(
semantic_text_chunker_skill_inputs = [
InputFieldMappingEntry(
name="content", source="/document/layout_merged_content"
)
),
InputFieldMappingEntry(
name="per_page_starting_sentences",
source="/document/per_page_starting_sentences",
),
]

semantic_text_chunker_skill_outputs = [
@@ -368,7 +382,13 @@ def get_layout_analysis_skill(
)
]
else:
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
output = [
OutputFieldMappingEntry(name="layout", target_name="layout"),
OutputFieldMappingEntry(
name="per_page_starting_sentences",
target_name="per_page_starting_sentences",
),
]

layout_analysis_skill = WebApiSkill(
name="Layout Analysis Skill",
@@ -81,6 +81,13 @@ def get_index_fields(self) -> list[SearchableField]:
type=SearchFieldDataType.String,
collection=True,
),
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
),
SearchField(
name="ChunkEmbedding",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
@@ -137,19 +144,6 @@ def get_index_fields(self) -> list[SearchableField]:
),
]

if self.enable_page_by_chunking:
fields.extend(
[
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
)
]
)

return fields

def get_semantic_search(self) -> SemanticSearch:
@@ -194,11 +188,12 @@ def get_skills(self) -> list:
if self.enable_page_by_chunking:
embedding_skill = self.get_vector_skill(
"/document/page_wise_layout/*",
"/document/page_wise_layout/*/chunk_cleaned",
"/document/page_wise_layout/*/final_cleaned_text",
)
else:
embedding_skill = self.get_vector_skill(
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
"/document/chunk_mark_ups/*",
"/document/chunk_mark_ups/*/final_cleaned_text",
)

if self.enable_page_by_chunking:
@@ -229,7 +224,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
source_context = "/document/page_wise_layout/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
name="Chunk", source="/document/page_wise_layout/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
@@ -239,24 +234,25 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections",
source="/document/page_wise_layout/*/chunk_sections",
source="/document/page_wise_layout/*/final_sections",
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/page_wise_layout/*/chunk_figures/*",
source="/document/page_wise_layout/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber", source="/document/page_wise_layout/*/page_number"
name="PageNumber",
source="/document/page_wise_layout/*/final_page_number",
),
]
else:
source_context = "/document/chunk_mark_ups/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
name="Chunk", source="/document/chunk_mark_ups/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
@@ -265,15 +261,19 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="Title", source="/document/Title"),
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
name="Sections", source="/document/chunk_mark_ups/*/final_sections"
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/chunk_mark_ups/*/chunk_figures/*",
source="/document/chunk_mark_ups/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber",
source="/document/chunk_mark_ups/*/final_page_number",
),
]

index_projections = SearchIndexerIndexProjection(
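The renames in the two files above follow one convention: a skill output's target_name becomes the node name in the skill-set enrichment tree, and the index projections read that node back by its path. A minimal sketch of the final_page_number flow (illustration only, not part of this commit), using the azure.search.documents.indexes.models classes already shown in the diff:

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
)

# The mark-up cleaner skill emits "page_number" and stores it in the enrichment
# tree under the target name "final_page_number"...
cleaner_page_number_output = OutputFieldMappingEntry(
    name="page_number", target_name="final_page_number"
)

# ...and the index projection reads it back by that enrichment-tree path and
# maps it onto the new PageNumber index field.
page_number_projection_mapping = InputFieldMappingEntry(
    name="PageNumber", source="/document/chunk_mark_ups/*/final_page_number"
)

The same pattern covers final_cleaned_text, final_sections, final_mark_up and final_chunk_figures.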
11 changes: 11 additions & 0 deletions image_processing/.coveragerc
@@ -0,0 +1,11 @@
[run]
omit =
tests/*
*/__init__.py

[report]
omit =
tests/*
*/__init__.py
exclude_lines =
if __name__ == "__main__":
5 changes: 5 additions & 0 deletions image_processing/pyproject.toml
@@ -43,4 +43,9 @@ dev = [
"pygments>=2.18.0",
"ruff>=0.8.1",
"python-dotenv>=1.0.1",
"coverage>=7.6.12",
"pytest>=8.3.4",
"pytest-asyncio>=0.25.3",
"pytest-cov>=6.0.0",
"pytest-mock>=3.14.0",
]
2 changes: 2 additions & 0 deletions image_processing/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
pythonpath = src/image_processing
Empty file.
42 changes: 41 additions & 1 deletion image_processing/src/image_processing/layout_analysis.py
@@ -22,6 +22,7 @@
LayoutHolder,
PageWiseContentHolder,
NonPageWiseContentHolder,
PerPageStartingSentenceHolder,
)


@@ -340,6 +341,40 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

return page_wise_contents

def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
"""Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
Returns:
--------
list: A list of the starting sentence of each page."""

per_page_starting_sentences = []

for page in self.result.pages:
page_content = self.result.content[
page.spans[0]["offset"] : page.spans[0]["offset"]
+ page.spans[0]["length"]
]

# Remove any leading whitespace/newlines.
cleaned_content = page_content.lstrip()
# Take the first line if the content contains a newline; otherwise take the text up to the first period.
if "\n" in cleaned_content:
first_line = cleaned_content.split("\n", 1)[0]
elif "." in cleaned_content:
first_line = cleaned_content.split(".", 1)[0]
else:
first_line = cleaned_content

per_page_starting_sentences.append(
PerPageStartingSentenceHolder(
page_number=page.page_number,
starting_sentence=first_line.strip(),
)
)

return per_page_starting_sentences

async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
"""Get the Azure Document Intelligence client.
@@ -487,7 +522,12 @@ async def analyse(self):
if self.extract_figures:
await self.process_figures_from_extracted_content(text_content)

output_record = NonPageWiseContentHolder(layout=text_content)
per_page_starting_sentences = self.create_per_page_starting_sentence()

output_record = NonPageWiseContentHolder(
layout=text_content,
per_page_starting_sentences=per_page_starting_sentences,
)

except Exception as e:
logging.error(e)
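The new create_per_page_starting_sentence heuristic prefers a newline boundary over a period when picking a page's starting sentence, so a heading line is kept whole even when a period appears earlier in it. A standalone sketch of that splitting logic, with the kind of assertions the new unit tests could make (the helper name is hypothetical; this is not the repository's actual test code):

def first_line_of_page(page_content: str) -> str:
    # Mirror of the heuristic above: strip leading whitespace, then prefer a
    # newline boundary over a period when extracting the starting sentence.
    cleaned_content = page_content.lstrip()
    if "\n" in cleaned_content:
        first_line = cleaned_content.split("\n", 1)[0]
    elif "." in cleaned_content:
        first_line = cleaned_content.split(".", 1)[0]
    else:
        first_line = cleaned_content
    return first_line.strip()


def test_first_line_prefers_newline_over_period():
    assert first_line_of_page("\n1. Introduction\nBody text.") == "1. Introduction"
    assert first_line_of_page("Single sentence. More text.") == "Single sentence"

These per-page starting sentences feed the semantic chunker's new per_page_starting_sentences input, which is presumably how each chunk later picks up its page_number.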
22 changes: 21 additions & 1 deletion image_processing/src/image_processing/layout_holders.py
@@ -6,7 +6,6 @@


class FigureHolder(BaseModel):

"""A class to hold the figure extracted from the document."""

figure_id: str = Field(..., alias="FigureId")
@@ -48,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
page_wise_layout: list[LayoutHolder]


class PerPageStartingSentenceHolder(BaseModel):
"""A class to hold the starting sentence of each page."""

page_number: int
starting_sentence: str


class NonPageWiseContentHolder(BaseModel):
"""A class to hold the non-page-wise content extracted from the document."""

layout: LayoutHolder
per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
default_factory=list
)


class ChunkHolder(BaseModel):
"""A class to hold the text extracted from the document after it has been chunked."""

mark_up: str
sections: Optional[list[str]] = Field(default_factory=list)
figures: Optional[list[FigureHolder]] = Field(default_factory=list)
starting_sentence: Optional[str] = None
cleaned_text: Optional[str] = None
page_number: Optional[int] = Field(default=None)
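
The holders shown above are plain pydantic models, so the chunk-level defaults are easy to see in isolation. A small sketch (illustration only, assuming the flat layout_holders import path that pytest.ini's pythonpath setting makes available):

from layout_holders import ChunkHolder, PerPageStartingSentenceHolder

# Starting sentence captured for page 1 by the layout analysis step.
sentence = PerPageStartingSentenceHolder(
    page_number=1, starting_sentence="1. Introduction"
)

# A freshly chunked piece of mark-up: only mark_up is required, everything else
# defaults until later skills fill it in.
chunk = ChunkHolder(mark_up="# 1. Introduction\nBody text.")
assert chunk.page_number is None
assert chunk.sections == [] and chunk.figures == []

# Once the chunk's starting sentence is matched to a page, the page number can
# be attached for indexing.
chunk.page_number = sentence.page_number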