From b0a6b80f04a3e5d4ba5e670770998b75892b0290 Mon Sep 17 00:00:00 2001 From: Restioson Date: Tue, 5 Nov 2024 15:00:16 +0200 Subject: [PATCH] feat: allow users to edit the document fulltext This allows users that have the suitable permission to edit the fulltext field of Documents. This field is used when searching, so it allows users to remove frontmatter from PDFs which are irrelevant, or paste a plaintext version for a Google Doc link, for example. The field is only shown to those with the relevant permission to avoid confusing people who may not understand what it is for. When a PDF is edited for a Document, unless the fulltext is also edited at the same time (within the same form submission), it will take priority and overwrite the old fulltext (even if edited prior). --- app/general/admin.py | 25 ++++- .../migrations/0014_alter_document_options.py | 17 ++++ .../0015_alter_document_document_data.py | 18 ++++ app/general/models.py | 11 ++- app/general/tests/test_document_admin.py | 98 ++++++++++++++++++- 5 files changed, 162 insertions(+), 7 deletions(-) create mode 100644 app/general/migrations/0014_alter_document_options.py create mode 100644 app/general/migrations/0015_alter_document_document_data.py diff --git a/app/general/admin.py b/app/general/admin.py index 171d219f..bbdbb8be 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -13,12 +13,12 @@ class DocumentForm(ModelForm): class Meta: model = Document fields = "__all__" # noqa: DJ007 + # Hide if the user doesn't have the permission - if they do, they get a DocumentFormWithFulltext instead + widgets = {"document_data": HiddenInput()} def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.fields["document_data"].widget = HiddenInput() - # If the instance has a mime_type, the field should be disabled if not self.instance.mime_type: self.fields["mime_type"].widget = HiddenInput() @@ -30,7 +30,14 @@ def clean(self): url = cleaned_data.get("url", "") uploaded_file = cleaned_data.get("uploaded_file", "") - if uploaded_file: + # We don't unconditionally re-extract PDF text, as the fulltext (`document_data` field) can be edited manually + # We only want to re-extract the PDF text if the file has changed _and_ the fulltext has not changed. This is to + # support the use-case of a user editing both the PDF and the fulltext at the same time. It would be confusing if + # the PDF just overrode the text that they explicitly pasted into that field on the same form page! + override_existing_fulltext = ( + "uploaded_file" in self.changed_data and "document_data" not in self.changed_data + ) + if uploaded_file and override_existing_fulltext: file_type = magic.from_buffer(uploaded_file.read(), mime=True) if file_type != "application/pdf": self.add_error("uploaded_file", _("Only PDF files are allowed.")) @@ -64,6 +71,12 @@ def clean(self): return cleaned_data +class DocumentFormWithFulltext(DocumentForm): + class Meta: + model = Document + fields = "__all__" # noqa: DJ007 + + class DocumentAdmin(SimpleHistoryAdmin): ordering = ["title"] list_display = ["title", "license", "document_type", "available"] @@ -72,6 +85,12 @@ class DocumentAdmin(SimpleHistoryAdmin): form = DocumentForm history_list_display = ["title", "license", "document_type", "available"] + def get_form(self, request, *args, **kwargs): + # Show the fulltext field if the user has the requisite permission + if request.user.has_perm("general.can_edit_fulltext"): + kwargs["form"] = DocumentFormWithFulltext + return super(DocumentAdmin, self).get_form(request, *args, **kwargs) + class SubjectAdmin(SimpleHistoryAdmin): ordering = ["name"] diff --git a/app/general/migrations/0014_alter_document_options.py b/app/general/migrations/0014_alter_document_options.py new file mode 100644 index 00000000..cb69acca --- /dev/null +++ b/app/general/migrations/0014_alter_document_options.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.8 on 2024-11-05 09:30 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0013_rename_documentfile_document_and_more'), + ] + + operations = [ + migrations.AlterModelOptions( + name='document', + options={'permissions': [('can_edit_fulltext', 'Can edit document fulltext (used for search)')], 'verbose_name': 'Document', 'verbose_name_plural': 'Documents'}, + ), + ] diff --git a/app/general/migrations/0015_alter_document_document_data.py b/app/general/migrations/0015_alter_document_document_data.py new file mode 100644 index 00000000..53551af7 --- /dev/null +++ b/app/general/migrations/0015_alter_document_document_data.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.8 on 2024-11-07 09:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('general', '0014_alter_document_options'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='document_data', + field=models.TextField(blank=True, help_text="This is the document's full-text which users can search through. By default, this is extracted from the document's PDF (if exists), but it can also be edited.", verbose_name='Searchable content'), + ), + ] diff --git a/app/general/models.py b/app/general/models.py index 0e56d660..bb167193 100644 --- a/app/general/models.py +++ b/app/general/models.py @@ -135,7 +135,14 @@ class Document(models.Model): document_type = models.CharField( max_length=200, choices=document_type_choices, verbose_name=_("document category") ) - document_data = models.TextField(blank=True, verbose_name=_("document data")) + document_data = models.TextField( + blank=True, + verbose_name=_("Searchable content"), + help_text=_( + "This is the document's full-text which users can search through. " + "By default, this is extracted from the document's PDF (if exists), but it can also be edited." + ), + ) institution = models.ForeignKey( "Institution", on_delete=models.CASCADE, verbose_name=_("institution") ) @@ -167,5 +174,7 @@ class Meta: GinIndex(fields=["search_vector"]), ] + permissions = [("can_edit_fulltext", "Can edit document fulltext (used for search)")] + def __str__(self): return self.title diff --git a/app/general/tests/test_document_admin.py b/app/general/tests/test_document_admin.py index c2051d46..a3c6bf4d 100644 --- a/app/general/tests/test_document_admin.py +++ b/app/general/tests/test_document_admin.py @@ -2,9 +2,11 @@ import unittest from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase +from django.urls import reverse -from general.admin import DocumentForm -from general.models import Institution +from general.admin import DocumentForm, DocumentFormWithFulltext +from general.models import Document, Institution class TestDocumentForm(unittest.TestCase): @@ -20,7 +22,9 @@ def setUp(self): pdf_file = f.read() self.file_mock = SimpleUploadedFile("test.pdf", pdf_file, content_type="application/pdf") - self.test_institution = Institution.objects.create(name="Test Institution for Document tests") + self.test_institution = Institution.objects.create( + name="Test Institution for Document tests" + ) def tearDown(self): self.test_institution.delete() @@ -96,6 +100,94 @@ def test_clean_with_large_file(self): self.assertIn("uploaded_file", form.errors) self.assertEqual(form.errors["uploaded_file"], ["File size must not exceed 10MB."]) + def test_edited_pdf_takes_priority_over_unedited_fulltext(self): + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Test both kinds of forms - both can have edited PDFs and unedited fulltext + for FormType in [DocumentForm, DocumentFormWithFulltext]: + form = FormType(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertNotEqual(form.cleaned_data["document_data"], "") + + def test_edited_fulltext_takes_priority_over_edited_pdf(self): + custom_data = "testingstringthatisnotinsidelorem.pdf" + tests_form = { + "title": "Test", + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": custom_data, + "description": "Test description", + } + + # We only test the DocumentFormWithFulltext because this is the only one that can have edited fulltext + form = DocumentFormWithFulltext(tests_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + self.assertEqual(form.cleaned_data["document_data"], custom_data) + + def test_edited_fulltext_reflects_in_database_and_search(self): + title = "Test document with edited fulltext" + custom_data = "testingstringthatisnotinsidelorem.pdf" + + original_form = { + "title": title, + "license": "CC0", + "document_type": "Glossary", + "mime_type": "pdf", + "institution": self.test_institution, + "url": "", + "uploaded_file": self.file_mock, + "document_data": "", + "description": "Test description", + } + + # Upload the form with the PDF fulltext extracted + form = DocumentForm(original_form, files={"uploaded_file": self.file_mock}) + self.assertTrue(form.is_valid()) + doc = form.save() + orig_fulltext = doc.document_data + self.assertNotEqual(orig_fulltext, custom_data) + + # Check we can search by PDF content + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Now, upload a copy with edited fulltext + edit_form = {**original_form, "document_data": custom_data, "id": doc.id} + form = DocumentFormWithFulltext( + edit_form, files={"uploaded_file": self.file_mock}, instance=doc + ) + self.assertTrue(form.is_valid()) + doc = form.save() + self.assertEqual(doc.document_data, custom_data) + + # Check that: 1. we only have one document with this title, and 2. it has the correct fulltext + self.assertEqual(Document.objects.get(title=title).document_data, custom_data) + + # Check we can search by the new content too + response = self.client.get(reverse("search"), {"search": custom_data}) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.context["page_obj"][0]["heading"], title) + + # Check that we CANNOT search by the old content anymore + response = self.client.get(reverse("search"), {"search": orig_fulltext}) + self.assertEqual(response.status_code, 200) + self.assertEqual(len(response.context["page_obj"]), 0) + if __name__ == "__main__": unittest.main()