Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: enhance PDF text extraction with PDF.js fallback and dynamic fi… #41

Merged
merged 2 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,6 @@
"@react-three/drei": "8.20.2",
"@react-three/fiber": "7.0.25",
"classnames": "2.5.1",
"convertapi-js": "^1.0.8",
"dompurify": "^3.2.1",
"ellipsed": "1.6.0",
"i18next": "22.0.6",
Expand Down
257 changes: 139 additions & 118 deletions src/components/UploadButton/UploadButton.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import React, { useState, useRef, useEffect } from 'react';
import React, { useState, useRef } from 'react';
import cx from 'classnames';
import ConvertApi from 'convertapi-js';
import UploadIcon from '../icons/Upload';
import Spin from '../ui/Spin';
import Alert from '../ui/Alert';
Expand All @@ -14,13 +13,22 @@ type UploadError = {

/**
* FileUploadButton component allows users to upload and convert files to text
* Supports PDF, DOC, DOCX and TXT files up to 10MB
* Converts files to text using ConvertAPI service
* Supports PDF and TXT files up to 10MB
* Extracts text from PDFs using PDF.js
*/

// Upload size cap in bytes; validateFile rejects any file whose size exceeds it
const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB
// Upper bound on extracted text length; longer content is reported as an error and discarded
const MAX_TEXT_LENGTH = 100000; // 100,000 characters
// NOTE(review): validateFile declares its own local ALLOWED_FILE_TYPES (.pdf and .txt only),
// which shadows this list there. This module-level value looks like the pre-change side of
// the diff - confirm which one the merged file keeps.
const ALLOWED_FILE_TYPES = ['.pdf', '.doc', '.docx', '.txt'];
// PDF.js release pinned for both CDN URLs below
const PDF_JS_VERSION = '3.11.174'; // Last stable version with .min.js files
// Worker script URL; assigned to pdfjsLib.GlobalWorkerOptions.workerSrc once the library script loads
const WORKER_URL = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${PDF_JS_VERSION}/pdf.worker.min.js`;
// Library script URL; used as the src of the script element injected when window.pdfjsLib is absent
const PDF_JS_URL = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${PDF_JS_VERSION}/pdf.min.js`;

// Add type definition for pdfjsLib
// Augments the global Window interface so window.pdfjsLib can be read and called after the
// PDF.js script has been injected at runtime. The property is typed as any, so calls made
// through it (getDocument, GlobalWorkerOptions, ...) are not type-checked.
declare global {
interface Window {
pdfjsLib: any;
}
}

const FileUploadButton = ({
setPreviewFiles,
Expand All @@ -31,21 +39,19 @@ const FileUploadButton = ({
}) => {
// State for loading indicator
const [isLoading, setIsLoading] = useState(false);
// State for tracking upload errors
// State for tracking upload errors
const [errors, setErrors] = useState<UploadError[]>([]);
// Reference to hidden file input
const fileInputRef = useRef<HTMLInputElement>(null);
// State for ConvertAPI authentication token
const [convertapiToken, setConvertapiToken] = useState<string>();

// Reset the error list to empty, dismissing every alert at once
const clearErrors = () => {
  return setErrors([]);
};

// Dismiss every stored error whose message equals the given text
const removeError = (errorMessage: string) => {
  setErrors(current => {
    const remaining = current.filter(entry => entry.message !== errorMessage);
    return remaining;
  });
};

// Add a new error and auto-remove after 5 seconds
const addError = (error: UploadError) => {
setErrors(prev => [...prev, error]);
Expand All @@ -56,30 +62,57 @@ const FileUploadButton = ({
};

/**
* Fetches ConvertAPI token from backend service
* Displays error if token fetch fails
* Extracts text from PDF using PDF.js
* @param file PDF file to process
* @returns Promise resolving to extracted text
*/
// NOTE(review): this span comes from a rendered diff without +/- markers, so lines of the
// earlier fetchConvertapiToken implementation appear interleaved with extractTextFromPDF.
// The annotations below describe the PDF.js path only - confirm against the merged file.
const fetchConvertapiToken = async () => {
const extractTextFromPDF = async (file: File): Promise<string> => {
try {
const result = await fetch('https://www.aisuru.com/api/convertapi-token');
const response = await result.json();
if (!response.Tokens?.[0]?.Id) {
throw new Error('Invalid token response');
// Load PDF.js if not already loaded
if (!window.pdfjsLib) {
// Inject the library script from PDF_JS_URL and wait until it has loaded or failed
await new Promise((resolve, reject) => {
const script = document.createElement('script');
script.src = PDF_JS_URL;
script.onload = () => {
// Set up worker
window.pdfjsLib.GlobalWorkerOptions.workerSrc = WORKER_URL;
resolve(true);
};
// The rejection value here is whatever the error handler receives (a DOM event, not an
// Error instance), so the catch block below falls back to its Unknown error text
script.onerror = reject;
document.head.appendChild(script);
});
}

// Extract text from PDF
const arrayBuffer = await file.arrayBuffer();
// Get PDF document
const pdf = await window.pdfjsLib.getDocument({ data: arrayBuffer })
.promise;
let text = '';

// Iterate through each page and extract text
// Pages are requested by number, from 1 through pdf.numPages, one at a time
for (let i = 1; i <= pdf.numPages; i++) {
const page = await pdf.getPage(i);
const content = await page.getTextContent();
// Filter out non-string items and join text
// The truthiness test also drops items whose str is an empty string
const pageText = content.items
.filter((item: any) => item.str && typeof item.str === 'string')
.map((item: any) => item.str)
.join(' ');
// Items within a page are space-separated; each page is terminated by a newline
text += pageText + '\n';
}
setConvertapiToken(response.Tokens[0].Id);

// Return extracted text
return text;
} catch (error) {
addError({
message: 'Failed to initialize file conversion service. Please try again later.',
severity: 'error'
});
// Re-throw with a PDF extraction failed prefix so the caller can report it;
// anything that is not an Error instance is reported as Unknown error
throw new Error(
`PDF extraction failed: ${
error instanceof Error ? error.message : 'Unknown error'
}`
);
}
};

// Fetch token on component mount
useEffect(() => {
fetchConvertapiToken();
}, []);

/**
* Validates uploaded file
* Checks file type and size restrictions
Expand All @@ -88,21 +121,26 @@ const FileUploadButton = ({
*/
const validateFile = (file: File): boolean => {
const fileExt = `.${file.name.split('.').pop()?.toLowerCase()}`;
const ALLOWED_FILE_TYPES = ['.pdf', '.txt'];

if (!ALLOWED_FILE_TYPES.includes(fileExt)) {
addError({
message: `File type "${fileExt}" is not supported. Please use: ${ALLOWED_FILE_TYPES.join(', ')}`,
message: `File type "${fileExt}" is not supported. Please use: ${ALLOWED_FILE_TYPES.join(
', '
)}`,
severity: 'error',
fileId: file.name
fileId: file.name,
});
return false;
}

if (file.size > MAX_FILE_SIZE) {
addError({
message: `File "${file.name}" exceeds ${MAX_FILE_SIZE / 1024 / 1024}MB limit`,
message: `File "${file.name}" exceeds ${
MAX_FILE_SIZE / 1024 / 1024
}MB limit`,
severity: 'error',
fileId: file.name
fileId: file.name,
});
return false;
}
Expand All @@ -111,98 +149,82 @@ const FileUploadButton = ({
};

/**
* Converts uploaded file to text using ConvertAPI
* @param file File to convert
* @returns Promise resolving to converted text or null if conversion fails
* Processes file to extract text content
* @param file File to process
* @returns Promise resolving to extracted text or null if processing fails
*/
// NOTE(review): this span comes from a rendered diff without +/- markers, so lines of the
// earlier convertToTxt (ConvertAPI) implementation appear interleaved with processFile.
// The annotations below describe the processFile path only - confirm against the merged file.
const convertToTxt = async (file: File): Promise<string | null> => {
if (!convertapiToken) {
addError({
message: 'File conversion service not initialized',
severity: 'error'
});
return null;
}

const fileExt = file.name.split('.').pop()?.toLowerCase() || 'pdf';
const processFile = async (file: File): Promise<string | null> => {
// Lower-cased text after the last dot of the file name, without the dot;
// empty string when pop yields nothing usable
const fileExt = file.name.split('.').pop()?.toLowerCase() || '';

try {
// Initialize ConvertAPI with token
const convertApi = ConvertApi.auth(convertapiToken);
const params = convertApi.createParams();
params.add('File', file);
params.add('TextEncoding', 'UTF-8');
params.add('PageRange', '1-2000');

// Convert file to text
const result = await convertApi.convert(fileExt, 'txt', params);
const fileUrl = result.files[0].Url;

// Fetch converted text content
const response = await fetch(fileUrl);
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
// Stays null when the extension is neither pdf nor txt
let text: string | null = null;

// pdf goes through PDF.js extraction; txt is read directly from the File
if (fileExt === 'pdf') {
text = await extractTextFromPDF(file);
} else if (fileExt === 'txt') {
text = await file.text();
}
const text = await response.text();

// Check text length limit
// A null or empty result skips this check and is returned as-is below
if (text.length > MAX_TEXT_LENGTH) {
if (text && text.length > MAX_TEXT_LENGTH) {
addError({
message: `File "${file.name}" content exceeds ${MAX_TEXT_LENGTH} characters`,
severity: 'error',
fileId: file.name
fileId: file.name,
});
return null;
}

return text;
} catch (error) {
// Any thrown error, including the one re-thrown by extractTextFromPDF, is surfaced
// through addError and mapped to a null result instead of propagating
addError({
message: `Failed to convert "${file.name}": ${error instanceof Error ? error.message : 'Unknown error'}`,
message: `Failed to process "${file.name}": ${
error instanceof Error ? error.message : 'Unknown error'
}`,
severity: 'error',
fileId: file.name
fileId: file.name,
});
return null;
}
};

/**
* Handles file selection event
* Validates files and converts them to text
* Updates preview files state with converted content
* Validates files and processes them to extract text
* Updates preview files state with processed content
*/
const handleFileSelect = async (e: React.ChangeEvent<HTMLInputElement>) => {
const files = Array.from(e.target.files || []);
if (files.length === 0) return;

setIsLoading(true);
clearErrors();

const newPreviewFiles: { name: string; id: string; content: string }[] = [];

// Process each selected file
for (const file of files) {
if (!validateFile(file)) continue;

const fileId = Math.random().toString(36).substr(2, 9);
const text = await convertToTxt(file);
const text = await processFile(file);

if (text) {
newPreviewFiles.push({
name: file.name,
id: fileId,
content: text
content: text,
});
}
}

// Update preview files if any conversions succeeded
// Update preview files if any processing succeeded
if (newPreviewFiles.length > 0) {
setPreviewFiles(newPreviewFiles);
if (newPreviewFiles.length < files.length) {
addError({
message: 'Some files were not processed successfully',
severity: 'warning'
severity: 'warning',
});
}
}
Expand All @@ -213,56 +235,55 @@ const FileUploadButton = ({
}
};

console.log(errors);

return (
<div className="relative file-upload-wrapper">
{/* Hidden file input triggered by button click */}
<input
ref={fileInputRef}
type="file"
accept=".pdf,.doc,.docx,.txt"
className="memori--upload-file-input"
onChange={handleFileSelect}
multiple
/>

{/* Upload button with loading state */}
<button
className={cx(
'memori-button',
'memori-button--circle',
'memori-button--icon-only',
'memori-share-button--button',
'memori--conversation-button',
{ 'memori--error': errors.length > 0 }
)}
onClick={() => fileInputRef.current?.click()}
disabled={isLoading}
title="Upload file"
>
{isLoading ? (
<Spin spinning className="memori--upload-icon" />
) : (
<UploadIcon className="memori--upload-icon" />
)}
</button>

{/* Error messages container */}
<div className="memori--error-message-container">
{errors.map((error, index) => (
<Alert
key={`${error.message}-${index}`}
open={true}
type={error.severity}
title={error.message}
onClose={() => removeError(error.message)}
width="300px"
/>
))}
{/* Hidden file input triggered by button click */}
<input
ref={fileInputRef}
type="file"
accept=".pdf,.txt"
className="memori--upload-file-input"
onChange={handleFileSelect}
multiple
/>

{/* Upload button with loading state */}
<button
className={cx(
'memori-button',
'memori-button--circle',
'memori-button--icon-only',
'memori-share-button--button',
'memori--conversation-button',
{ 'memori--error': errors.length > 0 }
)}
onClick={() => fileInputRef.current?.click()}
disabled={isLoading}
title="Upload file"
>
{isLoading ? (
<Spin spinning className="memori--upload-icon" />
) : (
<UploadIcon className="memori--upload-icon" />
)}
</button>

{/* Error messages container */}
<div className="memori--error-message-container">
{errors.map((error, index) => (
<Alert
key={`${error.message}-${index}`}
open={true}
type={error.severity}
title={'File upload failed'}
description={error.message}
onClose={() => removeError(error.message)}
width="350px"
/>
))}
</div>
</div>
</div>
);
};

export default FileUploadButton;
export default FileUploadButton;
Loading