Add unit tests for text extraction functions (#7)

* Add unit tests for text extraction functions
* Reduce file size for unicode test resources, add pytest-cov config, add gh workflow for automatic unit tests
* Fix test resource file loading to use context manager for safer handling
Vets-Who-Code · Oct 24, 2024 · e4aaafd · e4aaafd
1 parent 164d92c
commit e4aaafd
Show file tree

Hide file tree

Showing 20 changed files with 113 additions and 7 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,4 @@
+[run]
+omit =
+    tests/*
+    */tests/*
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
@@ -0,0 +1,23 @@
+name: Run Unit Tests with Pytest
+
+on: [ push, pull_request ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run tests with coverage report
+        run: |
+          pytest --cov --cov-report=term-missing
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,8 @@ __pycache__/
 # C extensions
 *.so
 
+.DS_Store
+
 # Distribution / packaging
 .Python
 build/

diff --git a/data/.DS_Store b/data/.DS_Store
diff --git a/data/employment_transitions/.DS_Store b/data/employment_transitions/.DS_Store
diff --git a/data/employment_transitions/job_codes/.DS_Store b/data/employment_transitions/job_codes/.DS_Store
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,5 @@ PyPDF2
 python-docx
 python-dotenv
 openai
+pytest
+pytest-cov
diff --git a/streamlit_app.py b/streamlit_app.py
@@ -21,14 +21,17 @@
 # Get the API key from environment variables
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
+
 # Function to inject custom CSS for light mode
 def inject_custom_css():
     st.markdown("""
         <!-- Your existing CSS styles -->
     """, unsafe_allow_html=True)
 
+
 inject_custom_css()
 
+
 # Function to read and extract text from PDFs
 def extract_text_from_pdf(file):
     try:
@@ -47,6 +50,7 @@ def extract_text_from_pdf(file):
         raise TypeError("Invalid PDF file.")
     return text
 
+
 # Function to read and extract text from Word documents
 def extract_text_from_word(file):
     try:
@@ -57,16 +61,19 @@ def extract_text_from_word(file):
     except BadZipfile as e:
         raise TypeError("Invalid docx file. File may be password protected or corrupted.")
 
+
 # Function to load military job codes from the directories (TXT format)
 def load_military_job_codes(base_path):
     # Your existing implementation
     pass
 
+
 # Function to translate military job code to civilian job suggestions
 def translate_job_code(job_code, job_codes):
     # Your existing implementation
     pass
 
+
 # Fetch response from OpenAI using the API key with increased timeout
 def fetch_from_model(conversation):
     """Send a request to OpenAI using the conversation history."""
@@ -99,15 +106,16 @@ def fetch_from_model(conversation):
     except httpx.RequestError as e:
         st.error(f"An error occurred while making a request to OpenAI: {e}")
         return "Error communicating with the OpenAI API."
-    
+
     except Exception as e:
         st.error(f"An unexpected error occurred: {e}")
         return "Unexpected error while fetching response."
 
+
 # Callback to process user input and clear it afterward
 def process_input(job_codes):
     user_input = st.session_state["temp_input"]
-    
+
     if user_input:
         # Store user input into chat history
         st.session_state.messages.append({"role": "user", "content": user_input})
@@ -117,7 +125,8 @@ def process_input(job_codes):
 
         # Include document content in the system prompt if available
         if "document_content" in st.session_state and st.session_state["document_content"]:
-            conversation[0]["content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"
+            conversation[0][
+                "content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"
 
         # Append previous messages, being mindful of token limits
         for msg in st.session_state.messages[-10:]:  # Adjust the number of messages as needed
@@ -132,10 +141,11 @@ def process_input(job_codes):
     # Clear the temporary input
     st.session_state["temp_input"] = ""
 
+
 # Handle user input and job code translation along with resume upload
 def handle_user_input(job_codes):
     """Handle user input for translating military job codes to civilian jobs, uploading resumes, and chatting."""
-    
+
     # Display chat messages first
     display_chat_messages()
 
@@ -156,6 +166,8 @@ def handle_user_input(job_codes):
             if uploaded_file.size > 20 * 1024 * 1024:
                 raise ValueError("File size is too large. Uploaded files must be less than 20 MB.")
 
+            print(type(uploaded_file))
+
             file_text = ""
 
             if uploaded_file.type == "application/pdf":
@@ -171,11 +183,12 @@ def handle_user_input(job_codes):
             st.error(e)
 
     # Input field for user queries (job code or general chat) at the bottom
-    st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:", 
-                  key="temp_input", 
-                  on_change=process_input, 
+    st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:",
+                  key="temp_input",
+                  on_change=process_input,
                   args=(job_codes,))
 
+
 # Display the app title and description
 def display_title_and_description():
     """Display the app title and description."""
@@ -185,6 +198,7 @@ def display_title_and_description():
         "to help veterans navigate employment transitions and find opportunities in civilian careers."
     )
 
+
 # Initialize session state
 def initialize_session_state():
     """Initialize session state variables for messages and chat history."""
@@ -195,6 +209,7 @@ def initialize_session_state():
     if "document_content" not in st.session_state:
         st.session_state.document_content = ""
 
+
 # Introduce the assistant
 def introduce_assistant():
     """Introduce the VetsAI Assistant."""
@@ -205,6 +220,7 @@ def introduce_assistant():
         )
         st.session_state.messages.append({"role": "assistant", "content": intro_message})
 
+
 # Display chat history
 def display_chat_messages():
     """Display existing chat messages stored in session state."""
@@ -216,6 +232,7 @@ def display_chat_messages():
             with st.chat_message("assistant"):
                 st.markdown(f"VetsAI: {message['content']}")
 
+
 # Main function to run the VetsAI Assistant app
 def main():
     """Main function to run the VetsAI Assistant app."""
@@ -231,5 +248,6 @@ def main():
     # Handle user input and chat
     handle_user_input(job_codes)
 
+
 if __name__ == "__main__":
     main()
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,25 @@
+import io
+import pytest
+import os
+
+TEST_RESOURCE_DIR = f"{os.path.dirname(__file__)}/resources"
+
+
+def load_resource_file(file_name):
+    """Read the file at ``file_name`` in binary mode into an in-memory stream.
+
+    Args:
+        file_name: path of the file to read.
+
+    Returns:
+        io.BytesIO: a stream holding the file's full contents; the file
+        itself is already closed when this function returns.
+    """
+    with open(file_name, "rb") as handle:
+        return io.BytesIO(handle.read())
+
+
+@pytest.fixture(scope="module")
+def file_resources():
+    """Provide the test resource files as in-memory streams keyed by base name.
+
+    Every regular, non-hidden file directly inside ``TEST_RESOURCE_DIR`` is
+    loaded with ``load_resource_file``. The key is the part of the file name
+    before its first ``"."`` (``docx_blank.docx`` -> ``"docx_blank"``).
+
+    Yields:
+        dict: maps each base name to the object returned by
+        ``load_resource_file`` for that file.
+    """
+    library = {}
+    # os.listdir returns entries in arbitrary order; sorting keeps the load
+    # order (and the winner when two files share a base name) deterministic.
+    for filename in sorted(os.listdir(TEST_RESOURCE_DIR)):
+        path = os.path.join(TEST_RESOURCE_DIR, filename)
+        # Skip OS metadata such as .DS_Store (it would be stored under the
+        # key "") and sub-directories (open() cannot read them).
+        if filename.startswith(".") or not os.path.isfile(path):
+            continue
+        library[filename.split(".")[0]] = load_resource_file(path)
+    # NOTE(review): the fixture is module-scoped, so every test in a module
+    # shares the same stream objects; a test that reads a stream leaves its
+    # position advanced for later tests -- confirm that is acceptable.
+    yield library
+
+
+def pytest_configure(config):
+    """Register the custom ``slow`` marker by adding a line to the ``markers`` ini value.
+
+    Args:
+        config: the configuration object handed to this hook; only its
+            ``addinivalue_line`` method is used.
+    """
+    slow_marker = "slow: marks tests as slow (deselect with '-m \"not slow\"')"
+    config.addinivalue_line("markers", slow_marker)
diff --git a/tests/resources/docx_blank.docx b/tests/resources/docx_blank.docx
diff --git a/tests/resources/docx_text_and_media.docx b/tests/resources/docx_text_and_media.docx
diff --git a/tests/resources/docx_text_only.docx b/tests/resources/docx_text_only.docx
diff --git a/tests/resources/docx_unicode_sample.docx b/tests/resources/docx_unicode_sample.docx
diff --git a/tests/resources/pdf_blank.pdf b/tests/resources/pdf_blank.pdf
diff --git a/tests/resources/pdf_text_and_media.pdf b/tests/resources/pdf_text_and_media.pdf
diff --git a/tests/resources/pdf_text_only.pdf b/tests/resources/pdf_text_only.pdf
diff --git a/tests/resources/pdf_unicode_sample.pdf b/tests/resources/pdf_unicode_sample.pdf
diff --git a/tests/test_streamlit_app.py b/tests/test_streamlit_app.py
@@ -0,0 +1,32 @@
+from streamlit_app import extract_text_from_pdf, extract_text_from_word
+import pytest
+
+
+class TestDOCXExtraction:
+    """Checks ``extract_text_from_word`` against the ``docx_*`` test resources."""
+
+    def test_extract_text_from_word_with_only_text(self, file_resources):
+        """``docx_text_only`` must yield exactly the expected sentence."""
+        document = file_resources["docx_text_only"]
+        extracted = extract_text_from_word(document)
+        assert extracted == "This document has text!"
+
+    def test_extract_text_from_word_with_empty_file(self, file_resources):
+        """``docx_blank`` must yield an empty string."""
+        document = file_resources["docx_blank"]
+        extracted = extract_text_from_word(document)
+        assert extracted == ""
+
+    def test_extract_text_from_word_with_non_text_contents(self, file_resources):
+        """``docx_text_and_media`` must yield exactly the expected sentence."""
+        document = file_resources["docx_text_and_media"]
+        extracted = extract_text_from_word(document)
+        assert extracted == "This document has text!"
+
+    def test_extract_text_from_word_with_special_characters(self, file_resources):
+        """``docx_unicode_sample`` must yield a truthy (non-empty) result."""
+        document = file_resources["docx_unicode_sample"]
+        extracted = extract_text_from_word(document)
+        assert extracted
+
+
+class TestPDFExtraction:
+    """Checks ``extract_text_from_pdf`` against the ``pdf_*`` test resources."""
+
+    def test_extract_text_from_pdf_with_only_text(self, file_resources):
+        """``pdf_text_only`` must yield exactly the expected sentence."""
+        document = file_resources["pdf_text_only"]
+        extracted = extract_text_from_pdf(document)
+        assert extracted == "This document has text!"
+
+    def test_extract_text_from_pdf_with_empty_file(self, file_resources):
+        """``pdf_blank`` must yield an empty string."""
+        document = file_resources["pdf_blank"]
+        extracted = extract_text_from_pdf(document)
+        assert extracted == ""
+
+    def test_extract_text_from_pdf_with_non_text_contents(self, file_resources):
+        """``pdf_text_and_media`` must contain the expected sentence."""
+        document = file_resources["pdf_text_and_media"]
+        extracted = extract_text_from_pdf(document)
+        # PyPDF2 also pulls text out of charts, so an exact == comparison
+        # is not possible here; check for containment instead.
+        assert "This document has text!" in extracted
+
+    @pytest.mark.slow
+    def test_extract_text_from_pdf_with_special_characters(self, file_resources):
+        """``pdf_unicode_sample`` must yield a truthy (non-empty) result."""
+        document = file_resources["pdf_unicode_sample"]
+        extracted = extract_text_from_pdf(document)
+        assert extracted
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,8 @@ __pycache__/ @@
     # C extensions
     *.so
+    .DS_Store
     # Distribution / packaging
     .Python
     build/
@@ Expand Down @@