Skip to content

Commit

Permalink
Add unit tests for text extraction functions (#7)
Browse files Browse the repository at this point in the history
* Add Unit Tests for Text Extraction Functions

* Reduce file size for unicode test resources, add pytest-cov config, add gh workflow for automatic unit tests

* Fix test resource file loading to use context manager for safer handling
  • Loading branch information
jonulak authored Oct 24, 2024
1 parent 164d92c commit e4aaafd
Show file tree
Hide file tree
Showing 20 changed files with 113 additions and 7 deletions.
Binary file removed .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[run]
# Leave the test suite itself out of the coverage measurement.
omit =
    tests/*
    */tests/*
23 changes: 23 additions & 0 deletions .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Run Unit Tests with Pytest

# Run on every push and on every pull request.
on:
  - push
  - pull_request

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # requirements.txt also lists pytest and pytest-cov, so this one
      # install step provides the test tooling as well.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run tests with coverage report
        run: pytest --cov --cov-report=term-missing
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ __pycache__/
# C extensions
*.so

.DS_Store

# Distribution / packaging
.Python
build/
Expand Down
Binary file removed data/.DS_Store
Binary file not shown.
Binary file removed data/employment_transitions/.DS_Store
Binary file not shown.
Binary file removed data/employment_transitions/job_codes/.DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ PyPDF2
python-docx
python-dotenv
openai
pytest
pytest-cov
32 changes: 25 additions & 7 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@
# Get the API key from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Function to inject custom CSS for light mode
def inject_custom_css():
    """Inject the app's custom CSS markup into the Streamlit page."""
    css_markup = """
<!-- Your existing CSS styles -->
"""
    # The markup is passed with unsafe_allow_html=True so it is treated as HTML.
    st.markdown(css_markup, unsafe_allow_html=True)


inject_custom_css()


# Function to read and extract text from PDFs
def extract_text_from_pdf(file):
try:
Expand All @@ -47,6 +50,7 @@ def extract_text_from_pdf(file):
raise TypeError("Invalid PDF file.")
return text


# Function to read and extract text from Word documents
def extract_text_from_word(file):
try:
Expand All @@ -57,16 +61,19 @@ def extract_text_from_word(file):
except BadZipfile as e:
raise TypeError("Invalid docx file. File may be password protected or corrupted.")


# Function to load military job codes from the directories (TXT format)
def load_military_job_codes(base_path):
    """Load military job codes from the directories under ``base_path``.

    Placeholder: the real implementation is not included here, so the
    function currently does nothing and returns ``None``.
    """
    # Your existing implementation
    return None


# Function to translate military job code to civilian job suggestions
def translate_job_code(job_code, job_codes):
    """Translate a military job code into civilian job suggestions.

    Placeholder: the real implementation is not included here, so the
    function currently does nothing and returns ``None``.
    """
    # Your existing implementation
    return None


# Fetch response from OpenAI using the API key with increased timeout
def fetch_from_model(conversation):
"""Send a request to OpenAI using the conversation history."""
Expand Down Expand Up @@ -99,15 +106,16 @@ def fetch_from_model(conversation):
except httpx.RequestError as e:
st.error(f"An error occurred while making a request to OpenAI: {e}")
return "Error communicating with the OpenAI API."

except Exception as e:
st.error(f"An unexpected error occurred: {e}")
return "Unexpected error while fetching response."


# Callback to process user input and clear it afterward
def process_input(job_codes):
user_input = st.session_state["temp_input"]

if user_input:
# Store user input into chat history
st.session_state.messages.append({"role": "user", "content": user_input})
Expand All @@ -117,7 +125,8 @@ def process_input(job_codes):

# Include document content in the system prompt if available
if "document_content" in st.session_state and st.session_state["document_content"]:
conversation[0]["content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"
conversation[0][
"content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"

# Append previous messages, being mindful of token limits
for msg in st.session_state.messages[-10:]: # Adjust the number of messages as needed
Expand All @@ -132,10 +141,11 @@ def process_input(job_codes):
# Clear the temporary input
st.session_state["temp_input"] = ""


# Handle user input and job code translation along with resume upload
def handle_user_input(job_codes):
"""Handle user input for translating military job codes to civilian jobs, uploading resumes, and chatting."""

# Display chat messages first
display_chat_messages()

Expand All @@ -156,6 +166,8 @@ def handle_user_input(job_codes):
if uploaded_file.size > 20 * 1024 * 1024:
raise ValueError("File size is too large. Uploaded files must be less than 20 MB.")

print(type(uploaded_file))

file_text = ""

if uploaded_file.type == "application/pdf":
Expand All @@ -171,11 +183,12 @@ def handle_user_input(job_codes):
st.error(e)

# Input field for user queries (job code or general chat) at the bottom
st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:",
key="temp_input",
on_change=process_input,
st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:",
key="temp_input",
on_change=process_input,
args=(job_codes,))


# Display the app title and description
def display_title_and_description():
"""Display the app title and description."""
Expand All @@ -185,6 +198,7 @@ def display_title_and_description():
"to help veterans navigate employment transitions and find opportunities in civilian careers."
)


# Initialize session state
def initialize_session_state():
"""Initialize session state variables for messages and chat history."""
Expand All @@ -195,6 +209,7 @@ def initialize_session_state():
if "document_content" not in st.session_state:
st.session_state.document_content = ""


# Introduce the assistant
def introduce_assistant():
"""Introduce the VetsAI Assistant."""
Expand All @@ -205,6 +220,7 @@ def introduce_assistant():
)
st.session_state.messages.append({"role": "assistant", "content": intro_message})


# Display chat history
def display_chat_messages():
"""Display existing chat messages stored in session state."""
Expand All @@ -216,6 +232,7 @@ def display_chat_messages():
with st.chat_message("assistant"):
st.markdown(f"VetsAI: {message['content']}")


# Main function to run the VetsAI Assistant app
def main():
"""Main function to run the VetsAI Assistant app."""
Expand All @@ -231,5 +248,6 @@ def main():
# Handle user input and chat
handle_user_input(job_codes)


if __name__ == "__main__":
main()
Empty file added tests/__init__.py
Empty file.
25 changes: 25 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import io
import pytest
import os

TEST_RESOURCE_DIR = f"{os.path.dirname(__file__)}/resources"


def load_resource_file(file_name):
    """Read ``file_name`` in binary mode and return its bytes as a stream.

    The file is closed before returning; the caller receives an
    independent ``io.BytesIO`` positioned at the start of the data.
    """
    with open(file_name, mode="rb") as handle:
        return io.BytesIO(handle.read())


@pytest.fixture(scope="module")
def file_resources():
    """Provide every file in the test resource directory as an in-memory stream.

    Yields:
        dict: maps the part of each file name before its first "." to the
        stream returned by ``load_resource_file`` for that file.
    """
    # NOTE: the fixture is module-scoped, so all tests in a module share the
    # same stream objects.
    yield {
        entry.split(".")[0]: load_resource_file(f"{TEST_RESOURCE_DIR}/{entry}")
        for entry in os.listdir(TEST_RESOURCE_DIR)
    }


def pytest_configure(config):
    """Add the custom ``slow`` marker line to the ``markers`` ini setting."""
    slow_marker = "slow: marks tests as slow (deselect with '-m \"not slow\"')"
    config.addinivalue_line("markers", slow_marker)
Binary file added tests/resources/docx_blank.docx
Binary file not shown.
Binary file added tests/resources/docx_text_and_media.docx
Binary file not shown.
Binary file added tests/resources/docx_text_only.docx
Binary file not shown.
Binary file added tests/resources/docx_unicode_sample.docx
Binary file not shown.
Binary file added tests/resources/pdf_blank.pdf
Binary file not shown.
Binary file added tests/resources/pdf_text_and_media.pdf
Binary file not shown.
Binary file added tests/resources/pdf_text_only.pdf
Binary file not shown.
Binary file added tests/resources/pdf_unicode_sample.pdf
Binary file not shown.
32 changes: 32 additions & 0 deletions tests/test_streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from streamlit_app import extract_text_from_pdf, extract_text_from_word
import pytest


class TestDOCXExtraction:
    """Checks ``extract_text_from_word`` against the DOCX sample resources."""

    def test_extract_text_from_word_with_only_text(self, file_resources):
        extracted = extract_text_from_word(file_resources["docx_text_only"])
        assert extracted == "This document has text!"

    def test_extract_text_from_word_with_empty_file(self, file_resources):
        extracted = extract_text_from_word(file_resources["docx_blank"])
        assert extracted == ""

    def test_extract_text_from_word_with_non_text_contents(self, file_resources):
        extracted = extract_text_from_word(file_resources["docx_text_and_media"])
        assert extracted == "This document has text!"

    def test_extract_text_from_word_with_special_characters(self, file_resources):
        # Only requires that some non-empty text is returned.
        extracted = extract_text_from_word(file_resources["docx_unicode_sample"])
        assert extracted


class TestPDFExtraction:
    """Checks ``extract_text_from_pdf`` against the PDF sample resources."""

    def test_extract_text_from_pdf_with_only_text(self, file_resources):
        extracted = extract_text_from_pdf(file_resources["pdf_text_only"])
        assert extracted == "This document has text!"

    def test_extract_text_from_pdf_with_empty_file(self, file_resources):
        extracted = extract_text_from_pdf(file_resources["pdf_blank"])
        assert extracted == ""

    def test_extract_text_from_pdf_with_non_text_contents(self, file_resources):
        # PyPDF2 will pull the text from charts also, so we cannot use == to compare
        extracted = extract_text_from_pdf(file_resources["pdf_text_and_media"])
        assert "This document has text!" in extracted

    @pytest.mark.slow
    def test_extract_text_from_pdf_with_special_characters(self, file_resources):
        # Only requires that some non-empty text is returned.
        extracted = extract_text_from_pdf(file_resources["pdf_unicode_sample"])
        assert extracted

0 comments on commit e4aaafd

Please sign in to comment.