Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unit tests for text extraction functions #7

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Coverage settings: paths matching the patterns under "omit" are left out of coverage measurement.
[run]
omit =
tests/*
*/tests/*
23 changes: 23 additions & 0 deletions .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# CI workflow: installs the project requirements and runs pytest with a coverage report.
name: Run Unit Tests with Pytest

# Triggered by both pushes and pull requests.
on: [ push, pull_request ]

jobs:
test:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'

# Upgrades pip, then installs everything listed in requirements.txt.
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt

# term-missing requests a terminal coverage report that also lists uncovered lines.
- name: Run tests with coverage report
run: |
pytest --cov --cov-report=term-missing
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ __pycache__/
# C extensions
*.so

.DS_Store

# Distribution / packaging
.Python
build/
Expand Down
Binary file removed data/.DS_Store
Binary file not shown.
Binary file removed data/employment_transitions/.DS_Store
Binary file not shown.
Binary file removed data/employment_transitions/job_codes/.DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ PyPDF2
python-docx
python-dotenv
openai
pytest
pytest-cov
32 changes: 25 additions & 7 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@
# Get the API key from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


# Function to inject custom CSS for light mode
def inject_custom_css():
    """Emit the app's custom CSS/HTML snippet through ``st.markdown``.

    The snippet is passed with ``unsafe_allow_html=True`` so that Streamlit
    renders it as raw HTML instead of escaping it.
    """
    # The literal is kept flush-left so its content is unchanged.
    css_snippet = """
<!-- Your existing CSS styles -->
"""
    st.markdown(css_snippet, unsafe_allow_html=True)


inject_custom_css()


# Function to read and extract text from PDFs
def extract_text_from_pdf(file):
try:
Expand All @@ -47,6 +50,7 @@ def extract_text_from_pdf(file):
raise TypeError("Invalid PDF file.")
return text


# Function to read and extract text from Word documents
def extract_text_from_word(file):
try:
Expand All @@ -57,16 +61,19 @@ def extract_text_from_word(file):
except BadZipfile as e:
raise TypeError("Invalid docx file. File may be password protected or corrupted.")


# Function to load military job codes from the directories (TXT format)
def load_military_job_codes(base_path):
    """Stub for loading military job codes located under *base_path*.

    In this revision the body is a placeholder: the argument is ignored
    and the function returns ``None``.
    """
    # Placeholder only; the real implementation is not present here.
    return None


# Function to translate military job code to civilian job suggestions
def translate_job_code(job_code, job_codes):
    """Stub for mapping a military *job_code* to civilian job suggestions.

    In this revision the body is a placeholder: both arguments are ignored
    and the function returns ``None``.
    """
    # Placeholder only; the real implementation is not present here.
    return None


# Fetch response from OpenAI using the API key with increased timeout
def fetch_from_model(conversation):
"""Send a request to OpenAI using the conversation history."""
Expand Down Expand Up @@ -99,15 +106,16 @@ def fetch_from_model(conversation):
except httpx.RequestError as e:
st.error(f"An error occurred while making a request to OpenAI: {e}")
return "Error communicating with the OpenAI API."

except Exception as e:
st.error(f"An unexpected error occurred: {e}")
return "Unexpected error while fetching response."


# Callback to process user input and clear it afterward
def process_input(job_codes):
user_input = st.session_state["temp_input"]

if user_input:
# Store user input into chat history
st.session_state.messages.append({"role": "user", "content": user_input})
Expand All @@ -117,7 +125,8 @@ def process_input(job_codes):

# Include document content in the system prompt if available
if "document_content" in st.session_state and st.session_state["document_content"]:
conversation[0]["content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"
conversation[0][
"content"] += f" The user has provided the following document content to assist you: {st.session_state['document_content']}"

# Append previous messages, being mindful of token limits
for msg in st.session_state.messages[-10:]: # Adjust the number of messages as needed
Expand All @@ -132,10 +141,11 @@ def process_input(job_codes):
# Clear the temporary input
st.session_state["temp_input"] = ""


# Handle user input and job code translation along with resume upload
def handle_user_input(job_codes):
"""Handle user input for translating military job codes to civilian jobs, uploading resumes, and chatting."""

# Display chat messages first
display_chat_messages()

Expand All @@ -156,6 +166,8 @@ def handle_user_input(job_codes):
if uploaded_file.size > 20 * 1024 * 1024:
raise ValueError("File size is too large. Uploaded files must be less than 20 MB.")

print(type(uploaded_file))

file_text = ""

if uploaded_file.type == "application/pdf":
Expand All @@ -171,11 +183,12 @@ def handle_user_input(job_codes):
st.error(e)

# Input field for user queries (job code or general chat) at the bottom
st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:",
key="temp_input",
on_change=process_input,
st.text_input("Enter your military job code (e.g., 11B, AFSC, MOS) or ask a question:",
key="temp_input",
on_change=process_input,
args=(job_codes,))


# Display the app title and description
def display_title_and_description():
"""Display the app title and description."""
Expand All @@ -185,6 +198,7 @@ def display_title_and_description():
"to help veterans navigate employment transitions and find opportunities in civilian careers."
)


# Initialize session state
def initialize_session_state():
"""Initialize session state variables for messages and chat history."""
Expand All @@ -195,6 +209,7 @@ def initialize_session_state():
if "document_content" not in st.session_state:
st.session_state.document_content = ""


# Introduce the assistant
def introduce_assistant():
"""Introduce the VetsAI Assistant."""
Expand All @@ -205,6 +220,7 @@ def introduce_assistant():
)
st.session_state.messages.append({"role": "assistant", "content": intro_message})


# Display chat history
def display_chat_messages():
"""Display existing chat messages stored in session state."""
Expand All @@ -216,6 +232,7 @@ def display_chat_messages():
with st.chat_message("assistant"):
st.markdown(f"VetsAI: {message['content']}")


# Main function to run the VetsAI Assistant app
def main():
"""Main function to run the VetsAI Assistant app."""
Expand All @@ -231,5 +248,6 @@ def main():
# Handle user input and chat
handle_user_input(job_codes)


if __name__ == "__main__":
main()
Empty file added tests/__init__.py
Empty file.
25 changes: 25 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import io
import pytest
import os

TEST_RESOURCE_DIR = f"{os.path.dirname(__file__)}/resources"


def load_resource_file(file_name):
    """Return the contents of *file_name* as an in-memory binary stream.

    The file is opened in binary mode, read in full, and its bytes are
    wrapped in an ``io.BytesIO`` positioned at the start.
    """
    with open(file_name, "rb") as handle:
        return io.BytesIO(handle.read())


@pytest.fixture(scope="module")
def file_resources():
    """Yield a dict of every file in ``TEST_RESOURCE_DIR``, loaded into memory.

    Keys are the part of each file name before its first ``.``; values are
    the streams returned by ``load_resource_file``. The fixture is
    module-scoped, so the same stream objects are shared by all tests in a
    module.
    """
    yield {
        entry.split(".")[0]: load_resource_file(f"{TEST_RESOURCE_DIR}/{entry}")
        for entry in os.listdir(TEST_RESOURCE_DIR)
    }


def pytest_configure(config):
    """Register the custom ``slow`` marker on the given pytest *config*.

    Adds one line to the ``markers`` ini value describing the marker and
    how to deselect tests that carry it.
    """
    slow_marker = "slow: marks tests as slow (deselect with '-m \"not slow\"')"
    config.addinivalue_line("markers", slow_marker)
Binary file added tests/resources/docx_blank.docx
Binary file not shown.
Binary file added tests/resources/docx_text_and_media.docx
Binary file not shown.
Binary file added tests/resources/docx_text_only.docx
Binary file not shown.
Binary file added tests/resources/docx_unicode_sample.docx
Binary file not shown.
Binary file added tests/resources/pdf_blank.pdf
Binary file not shown.
Binary file added tests/resources/pdf_text_and_media.pdf
Binary file not shown.
Binary file added tests/resources/pdf_text_only.pdf
Binary file not shown.
Binary file added tests/resources/pdf_unicode_sample.pdf
Binary file not shown.
32 changes: 32 additions & 0 deletions tests/test_streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from streamlit_app import extract_text_from_pdf, extract_text_from_word
import pytest


class TestDOCXExtraction:
    """Checks ``extract_text_from_word`` against the ``docx_*`` entries of ``file_resources``."""

    def test_extract_text_from_word_with_only_text(self, file_resources):
        """The text-only document yields exactly its one sentence."""
        extracted = extract_text_from_word(file_resources["docx_text_only"])
        assert extracted == "This document has text!"

    def test_extract_text_from_word_with_empty_file(self, file_resources):
        """The blank document yields an empty string."""
        extracted = extract_text_from_word(file_resources["docx_blank"])
        assert extracted == ""

    def test_extract_text_from_word_with_non_text_contents(self, file_resources):
        """The text-and-media document yields only the text sentence."""
        extracted = extract_text_from_word(file_resources["docx_text_and_media"])
        assert extracted == "This document has text!"

    def test_extract_text_from_word_with_special_characters(self, file_resources):
        """The unicode sample only has to produce a non-empty result."""
        extracted = extract_text_from_word(file_resources["docx_unicode_sample"])
        assert extracted


class TestPDFExtraction:
    """Checks ``extract_text_from_pdf`` against the ``pdf_*`` entries of ``file_resources``."""

    def test_extract_text_from_pdf_with_only_text(self, file_resources):
        """The text-only PDF yields exactly its one sentence."""
        extracted = extract_text_from_pdf(file_resources["pdf_text_only"])
        assert extracted == "This document has text!"

    def test_extract_text_from_pdf_with_empty_file(self, file_resources):
        """The blank PDF yields an empty string."""
        extracted = extract_text_from_pdf(file_resources["pdf_blank"])
        assert extracted == ""

    def test_extract_text_from_pdf_with_non_text_contents(self, file_resources):
        """The text-and-media PDF must contain the sentence somewhere in its output."""
        extracted = extract_text_from_pdf(file_resources["pdf_text_and_media"])
        # PyPDF2 also extracts text from charts, so check containment rather than equality.
        assert "This document has text!" in extracted

    @pytest.mark.slow
    def test_extract_text_from_pdf_with_special_characters(self, file_resources):
        """The unicode sample only has to produce a non-empty result (marked slow)."""
        extracted = extract_text_from_pdf(file_resources["pdf_unicode_sample"])
        assert extracted