Add Unit Tests for Image Processing + Page Number Tracking
BenConstable9 authored Feb 19, 2025
1 parent 064d406 commit c17067a
Showing 22 changed files with 2,160 additions and 174 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/ci-checks.yaml
@@ -36,3 +36,29 @@ jobs:

- name: Run pre-commit
run: uv run pre-commit run --all-files

job-image-processing-unit-tests:
name: Image Processing Unit Tests
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: ${{ env.MIN_PYTHON_VERSION }}

- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true

- name: Install the project
run: uv sync
working-directory: image_processing

- name: Run PyTest
run: uv run pytest --cov=. --cov-config=.coveragerc
working-directory: image_processing
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -18,6 +18,7 @@ repos:

# Python checks
- id: name-tests-test
args: [--pytest-test-first]

# JSON files
- id: pretty-format-json
36 changes: 28 additions & 8 deletions deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py
@@ -219,7 +219,11 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/page_wise_layout/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/page_wise_layout/*/merged_content"
name="mark_up", source="/document/page_wise_layout/*/merged_content"
),
InputFieldMappingEntry(
name="page_number",
source="/document/page_wise_layout/*/page_number",
),
InputFieldMappingEntry(
name="figures",
@@ -230,20 +234,26 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill:
mark_up_cleaner_context = "/document/chunk_mark_ups/*"
inputs = [
InputFieldMappingEntry(
name="chunk", source="/document/chunk_mark_ups/*"
name="mark_up", source="/document/chunk_mark_ups/*/mark_up"
),
InputFieldMappingEntry(
name="page_number", source="/document/chunk_mark_ups/*/page_number"
),
InputFieldMappingEntry(
name="figures", source="/document/layout/figures/*/updated_figure"
),
]

mark_up_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="chunk_cleaned", target_name="chunk_cleaned"),
OutputFieldMappingEntry(
name="chunk_sections", target_name="chunk_sections"
name="cleaned_text", target_name="final_cleaned_text"
),
OutputFieldMappingEntry(name="sections", target_name="final_sections"),
OutputFieldMappingEntry(name="mark_up", target_name="final_mark_up"),
OutputFieldMappingEntry(name="figures", target_name="final_chunk_figures"),
OutputFieldMappingEntry(
name="page_number", target_name="final_page_number"
),
OutputFieldMappingEntry(name="chunk_mark_up", target_name="chunk_mark_up"),
OutputFieldMappingEntry(name="chunk_figures", target_name="chunk_figures"),
]

mark_up_cleaner_skill = WebApiSkill(
@@ -302,7 +312,11 @@ def get_semantic_chunker_skill(
semantic_text_chunker_skill_inputs = [
InputFieldMappingEntry(
name="content", source="/document/layout_merged_content"
)
),
InputFieldMappingEntry(
name="per_page_starting_sentences",
source="/document/per_page_starting_sentences",
),
]

semantic_text_chunker_skill_outputs = [
@@ -368,7 +382,13 @@ def get_layout_analysis_skill(
)
]
else:
output = [OutputFieldMappingEntry(name="layout", target_name="layout")]
output = [
OutputFieldMappingEntry(name="layout", target_name="layout"),
OutputFieldMappingEntry(
name="per_page_starting_sentences",
target_name="per_page_starting_sentences",
),
]

layout_analysis_skill = WebApiSkill(
name="Layout Analysis Skill",
@@ -81,6 +81,13 @@ def get_index_fields(self) -> list[SearchableField]:
type=SearchFieldDataType.String,
collection=True,
),
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
),
SearchField(
name="ChunkEmbedding",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
@@ -137,19 +144,6 @@ def get_index_fields(self) -> list[SearchableField]:
),
]

if self.enable_page_by_chunking:
fields.extend(
[
SimpleField(
name="PageNumber",
type=SearchFieldDataType.Int64,
sortable=True,
filterable=True,
facetable=True,
)
]
)

return fields

def get_semantic_search(self) -> SemanticSearch:
@@ -194,11 +188,12 @@ def get_skills(self) -> list:
if self.enable_page_by_chunking:
embedding_skill = self.get_vector_skill(
"/document/page_wise_layout/*",
"/document/page_wise_layout/*/chunk_cleaned",
"/document/page_wise_layout/*/final_cleaned_text",
)
else:
embedding_skill = self.get_vector_skill(
"/document/chunk_mark_ups/*", "/document/chunk_mark_ups/*/chunk_cleaned"
"/document/chunk_mark_ups/*",
"/document/chunk_mark_ups/*/final_cleaned_text",
)

if self.enable_page_by_chunking:
@@ -229,7 +224,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
source_context = "/document/page_wise_layout/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/page_wise_layout/*/chunk_mark_up"
name="Chunk", source="/document/page_wise_layout/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
@@ -239,24 +234,25 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections",
source="/document/page_wise_layout/*/chunk_sections",
source="/document/page_wise_layout/*/final_sections",
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/page_wise_layout/*/chunk_figures/*",
source="/document/page_wise_layout/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber", source="/document/page_wise_layout/*/page_number"
name="PageNumber",
source="/document/page_wise_layout/*/final_page_number",
),
]
else:
source_context = "/document/chunk_mark_ups/*"
mappings = [
InputFieldMappingEntry(
name="Chunk", source="/document/chunk_mark_ups/*/chunk_mark_up"
name="Chunk", source="/document/chunk_mark_ups/*/final_mark_up"
),
InputFieldMappingEntry(
name="ChunkEmbedding",
@@ -265,15 +261,19 @@ def get_index_projections(self) -> SearchIndexerIndexProjection:
InputFieldMappingEntry(name="Title", source="/document/Title"),
InputFieldMappingEntry(name="SourceUri", source="/document/SourceUri"),
InputFieldMappingEntry(
name="Sections", source="/document/chunk_mark_ups/*/chunk_sections"
name="Sections", source="/document/chunk_mark_ups/*/final_sections"
),
InputFieldMappingEntry(
name="ChunkFigures",
source="/document/chunk_mark_ups/*/chunk_figures/*",
source="/document/chunk_mark_ups/*/final_chunk_figures/*",
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(
name="PageNumber",
source="/document/chunk_mark_ups/*/final_page_number",
),
]

index_projections = SearchIndexerIndexProjection(
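The renames in the two files above follow one convention: a skill output's target_name becomes the node name in the skill-set enrichment tree, and the index projections read that node back by its path. A minimal sketch of the final_page_number flow (illustration only, not part of this commit), using the azure.search.documents.indexes.models classes already shown in the diff:

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
)

# The mark-up cleaner skill emits "page_number" and stores it in the enrichment
# tree under the target name "final_page_number"...
cleaner_page_number_output = OutputFieldMappingEntry(
    name="page_number", target_name="final_page_number"
)

# ...and the index projection reads it back by that enrichment-tree path and
# maps it onto the new PageNumber index field.
page_number_projection_mapping = InputFieldMappingEntry(
    name="PageNumber", source="/document/chunk_mark_ups/*/final_page_number"
)

The same pattern covers final_cleaned_text, final_sections, final_mark_up and final_chunk_figures.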
11 changes: 11 additions & 0 deletions image_processing/.coveragerc
@@ -0,0 +1,11 @@
[run]
omit =
tests/*
*/__init__.py

[report]
omit =
tests/*
*/__init__.py
exclude_lines =
if __name__ == "__main__":
5 changes: 5 additions & 0 deletions image_processing/pyproject.toml
@@ -43,4 +43,9 @@ dev = [
"pygments>=2.18.0",
"ruff>=0.8.1",
"python-dotenv>=1.0.1",
"coverage>=7.6.12",
"pytest>=8.3.4",
"pytest-asyncio>=0.25.3",
"pytest-cov>=6.0.0",
"pytest-mock>=3.14.0",
]
2 changes: 2 additions & 0 deletions image_processing/pytest.ini
@@ -0,0 +1,2 @@
[pytest]
pythonpath = src/image_processing
Empty file.
42 changes: 41 additions & 1 deletion image_processing/src/image_processing/layout_analysis.py
@@ -22,6 +22,7 @@
LayoutHolder,
PageWiseContentHolder,
NonPageWiseContentHolder,
PerPageStartingSentenceHolder,
)


@@ -340,6 +341,40 @@ def create_page_wise_content(self) -> list[LayoutHolder]:

return page_wise_contents

def create_per_page_starting_sentence(self) -> list[PerPageStartingSentenceHolder]:
"""Create a list of the starting sentence of each page so we can assign the starting sentence to the page number.
Returns:
--------
list: A list of the starting sentence of each page."""

per_page_starting_sentences = []

for page in self.result.pages:
page_content = self.result.content[
page.spans[0]["offset"] : page.spans[0]["offset"]
+ page.spans[0]["length"]
]

# Remove any leading whitespace/newlines.
cleaned_content = page_content.lstrip()
# Take the first line if the content contains a newline; otherwise take the text up to the first period.
if "\n" in cleaned_content:
first_line = cleaned_content.split("\n", 1)[0]
elif "." in cleaned_content:
first_line = cleaned_content.split(".", 1)[0]
else:
first_line = cleaned_content

per_page_starting_sentences.append(
PerPageStartingSentenceHolder(
page_number=page.page_number,
starting_sentence=first_line.strip(),
)
)

return per_page_starting_sentences

async def get_document_intelligence_client(self) -> DocumentIntelligenceClient:
"""Get the Azure Document Intelligence client.
@@ -487,7 +522,12 @@ async def analyse(self):
if self.extract_figures:
await self.process_figures_from_extracted_content(text_content)

output_record = NonPageWiseContentHolder(layout=text_content)
per_page_starting_sentences = self.create_per_page_starting_sentence()

output_record = NonPageWiseContentHolder(
layout=text_content,
per_page_starting_sentences=per_page_starting_sentences,
)

except Exception as e:
logging.error(e)
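The new create_per_page_starting_sentence heuristic prefers a newline boundary over a period when picking a page's starting sentence, so a heading line is kept whole even when a period appears earlier in it. A standalone sketch of that splitting logic, with the kind of assertions the new unit tests could make (the helper name is hypothetical; this is not the repository's actual test code):

def first_line_of_page(page_content: str) -> str:
    # Mirror of the heuristic above: strip leading whitespace, then prefer a
    # newline boundary over a period when extracting the starting sentence.
    cleaned_content = page_content.lstrip()
    if "\n" in cleaned_content:
        first_line = cleaned_content.split("\n", 1)[0]
    elif "." in cleaned_content:
        first_line = cleaned_content.split(".", 1)[0]
    else:
        first_line = cleaned_content
    return first_line.strip()


def test_first_line_prefers_newline_over_period():
    assert first_line_of_page("\n1. Introduction\nBody text.") == "1. Introduction"
    assert first_line_of_page("Single sentence. More text.") == "Single sentence"

These per-page starting sentences feed the semantic chunker's new per_page_starting_sentences input, which is presumably how each chunk later picks up its page_number.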
22 changes: 21 additions & 1 deletion image_processing/src/image_processing/layout_holders.py
@@ -6,7 +6,6 @@


class FigureHolder(BaseModel):

"""A class to hold the figure extracted from the document."""

figure_id: str = Field(..., alias="FigureId")
@@ -48,7 +47,28 @@ class PageWiseContentHolder(BaseModel):
page_wise_layout: list[LayoutHolder]


class PerPageStartingSentenceHolder(BaseModel):
"""A class to hold the starting sentence of each page."""

page_number: int
starting_sentence: str


class NonPageWiseContentHolder(BaseModel):
"""A class to hold the non-page-wise content extracted from the document."""

layout: LayoutHolder
per_page_starting_sentences: list[PerPageStartingSentenceHolder] = Field(
default_factory=list
)


class ChunkHolder(BaseModel):
"""A class to hold the text extracted from the document after it has been chunked."""

mark_up: str
sections: Optional[list[str]] = Field(default_factory=list)
figures: Optional[list[FigureHolder]] = Field(default_factory=list)
starting_sentence: Optional[str] = None
cleaned_text: Optional[str] = None
page_number: Optional[int] = Field(default=None)
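
The holders shown above are plain pydantic models, so the chunk-level defaults are easy to see in isolation. A small sketch (illustration only, assuming the flat layout_holders import path that pytest.ini's pythonpath setting makes available):

from layout_holders import ChunkHolder, PerPageStartingSentenceHolder

# Starting sentence captured for page 1 by the layout analysis step.
sentence = PerPageStartingSentenceHolder(
    page_number=1, starting_sentence="1. Introduction"
)

# A freshly chunked piece of mark-up: only mark_up is required, everything else
# defaults until later skills fill it in.
chunk = ChunkHolder(mark_up="# 1. Introduction\nBody text.")
assert chunk.page_number is None
assert chunk.sections == [] and chunk.figures == []

# Once the chunk's starting sentence is matched to a page, the page number can
# be attached for indexing.
chunk.page_number = sentence.page_number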