diff --git a/app/general/admin.py b/app/general/admin.py index 171d219f..bbdbb8be 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -13,12 +13,12 @@ class DocumentForm(ModelForm): class Meta: model = Document fields = "__all__" # noqa: DJ007 + # Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead + widgets = {"document_data": HiddenInput()} def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.fields["document_data"].widget = HiddenInput() - # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: self.fields["mime_type"].widget = HiddenInput() @@ -30,7 +30,14 @@ def clean(self): url = cleaned_data.get("url", "") uploaded_file = cleaned_data.get("uploaded_file", "") - if uploaded_file: + # We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually + # We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to + # support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if + # the PDF just overrode the text that they explicitly pasted into that field on the same form page! + override_existing_fulltext = ( + "uploaded_file" in self.changed_data and "document_data" not in self.changed_data + ) + if uploaded_file and override_existing_fulltext: file_type = magic.from_buffer(uploaded_file.read(), mime=True) if file_type != "application/pdf": self.add_error("uploaded_file", _("Only PDF files are allowed.")) @@ -64,6 +71,12 @@ def clean(self): return cleaned_data +class DocumentFormWithFulltext(DocumentForm): + class Meta: + model = Document + fields = "__all__" # noqa: DJ007 + + class DocumentAdmin(SimpleHistoryAdmin): ordering = ["title"] list_display = ["title", "license", "document_type", "available"] @@ -72,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin): form = DocumentForm history_list_display = ["title", "license", "document_type", "available"] + def get_form(self, request, *args, **kwargs): + # Show the fulltext field if the user has the requisite permission + if request.user.has_perm("general.can_edit_fulltext"): + kwargs["form"] = DocumentFormWithFulltext + return super(DocumentAdmin, self).get_form(request, *args, **kwargs) + class SubjectAdmin(SimpleHistoryAdmin): ordering = ["name"] diff --git a/app/general/migrations/0014_alter_document_options.py b/app/general/migrations/0014_alter_document_options.py new file mode 100644 index 00000000..cb69acca --- /dev/null +++ b/app/general/migrations/0014_alter_document_options.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.8 on 2024-11-05 09:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0013_rename_documentfile_document_and_more'), + ] + + operations = [ + migrations.AlterModelOptions( + name='document', + options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'}, + ), + ] diff --git a/app/general/migrations/0015_alter_document_document_data.py b/app/general/migrations/0015_alter_document_document_data.py new file mode 100644 index 00000000..53551af7 --- /dev/null +++ b/app/general/migrations/0015_alter_document_document_data.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.8 on 2024-11-07 09:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0014_alter_document_options'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='document_data', + field=models.TextField(blank=True, help_text="This is the document's full-text which users can search through. By default, this is extracted from the document's PDF (if exists), but it can also be edited.", verbose_name='Searchable content'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index 0e56d660..bb167193 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -135,7 +135,14 @@ class Document(models.Model): document_type = models.CharField( max_length=200, choices=document_type_choices, verbose_name=_("document category") ) - document_data = models.TextField(blank=True, verbose_name=_("document data")) + document_data = models.TextField( + blank=True, + verbose_name=_("Searchable content"), + help_text=_( + "This is the document's full-text which users can search through. " + "By default, this is extracted from the document's PDF (if exists), but it can also be edited." + ), + ) institution = models.ForeignKey( "Institution", on_delete=models.CASCADE, verbose_name=_("institution") ) @@ -167,5 +174,7 @@ class Meta: GinIndex(fields=["search_vector"]), ] + permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")] + def __str__(self): return self.title diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index c2051d46..a3c6bf4d 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -2,9 +2,11 @@ import unittest from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase +from django.urls import reverse -from general.admin import DocumentForm -from general.models import Institution +from general.admin import DocumentForm, DocumentFormWithFulltext +from general.models import Document, Institution class TestDocumentForm(unittest.TestCase): @@ -20,7 +22,9 @@ def setUp(self): pdf_file = f.read() self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf") - self.test_institution = Institution.objects.create(name="Test Institution for Document tests") + self.test_institution = Institution.objects.create( + name="Test Institution for Document tests" + ) def tearDown(self): self.test_institution.delete() @@ -96,6 +100,94 @@ def test_clean_with_large_file(self): self.assertIn("uploaded_file", form.errors) self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."]) + def test_edited_pdf_takes_priority_over_unedited_fulltext(self): + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Test both kinds of forms - both can have edited PDFs and unedited fulltext + for FormType in [DocumentForm, DocumentFormWithFulltext]: + form = FormType(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertNotEqual(form.cleaned_data["document_data"], "") + + def test_edited_fulltext_takes_priority_over_edited_pdf(self): + custom_data = "testingstringthatisnotinsidelorem.pdf" + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": custom_data, + "description": "Test description", + } + + # We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext + form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertEqual(form.cleaned_data["document_data"], custom_data) + + def test_edited_fulltext_reflects_in_database_and_search(self): + title = "Test document with edited fulltext" + custom_data = "testingstringthatisnotinsidelorem.pdf" + + original_form = { + "title": title, + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Upload the form with the PDF fulltext extracted + form = DocumentForm(original_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + doc = form.save() + orig_fulltext = doc.document_data + self.assertNotEqual(orig_fulltext, custom_data) + + # Check we can search by PDF content + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Now, upload a copy with edited fulltext + edit_form = {**original_form, "document_data": custom_data, "id": doc.id} + form = DocumentFormWithFulltext( + edit_form, files={"uploaded_file": self.file_mock}, instance=doc + ) + self.assertTrue(form.is_valid()) + doc = form.save() + self.assertEqual(doc.document_data, custom_data) + + # Check that: 1. we only have one document with this title, and 2. it has the correct fulltext + self.assertEqual(Document.objects.get(title=title).document_data, custom_data) + + # Check we can search by the new content too + response = self.client.get(reverse("search"), {"search": custom_data}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Check that we CANNOT search by the old content anymore + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(len(response.context["page_obj"]), 0) + if __name__ == "__main__": unittest.main()