Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/support extension clean markdown #498

Merged
merged 9 commits into from
Feb 13, 2025
1 change: 1 addition & 0 deletions apps/api/src/knowledge/parsers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export interface ParserOptions {
format?: string;
mockMode?: boolean;
timeout?: number;
extractMedia?: boolean;
}

export interface ParseResult {
Expand Down
53 changes: 34 additions & 19 deletions apps/api/src/knowledge/parsers/pandoc.parser.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Injectable } from '@nestjs/common';
import { Injectable, Logger } from '@nestjs/common';
import { spawn } from 'node:child_process';
import { BaseParser, ParserOptions, ParseResult } from './base';
import { ConfigService } from '@nestjs/config';
Expand All @@ -8,13 +8,16 @@ import os from 'node:os';

@Injectable()
export class PandocParser extends BaseParser {
private readonly logger = new Logger(PandocParser.name);

/**
 * Build a parser instance with its default options.
 *
 * @param config - Configuration service, kept as a private read-only member.
 * @param options - Optional overrides; defaults to an empty object.
 */
constructor(
private readonly config: ConfigService,
options: ParserOptions = {},
) {
// Defaults come first so that keys present in `options` replace them
// when the caller's object is spread afterwards.
super({
format: 'markdown',
// NOTE(review): presumably milliseconds — confirm against BaseParser.
timeout: 30000,
// Media extraction is on unless the caller passes `extractMedia: false`.
extractMedia: true,
...options,
});
}
Expand Down Expand Up @@ -50,6 +53,10 @@ export class PandocParser extends BaseParser {
return images;
}

/**
 * Report whether pandoc's stderr output consists solely of warnings.
 *
 * A plain substring search for "warning" would also match genuine error
 * messages that merely mention the word, so each message line must carry
 * pandoc's `[WARNING]` tag (compared case-insensitively). Indented lines
 * that follow a tagged line are treated as continuations of that warning.
 *
 * @param stderr - Raw text collected from the pandoc process's stderr.
 * @returns `true` when at least one `[WARNING]` line is present and no
 *   untagged message line is found; `false` otherwise (including for
 *   empty or whitespace-only input).
 */
private isWarning(stderr: string): boolean {
  let sawWarning = false;

  for (const line of stderr.split(/\r?\n/)) {
    if (line.trim() === '') {
      continue;
    }

    // Indented text after a warning is detail belonging to that warning.
    if (sawWarning && /^\s/.test(line)) {
      continue;
    }

    if (!line.trimStart().toLowerCase().startsWith('[warning]')) {
      // Any untagged message line means stderr holds more than warnings.
      return false;
    }

    sawWarning = true;
  }

  return sawWarning;
}

async parse(input: string | Buffer): Promise<ParseResult> {
if (this.options.mockMode) {
return {
Expand All @@ -62,15 +69,14 @@ export class PandocParser extends BaseParser {
const mediaDir = path.join(tempDir, 'media');

try {
const pandoc = spawn('pandoc', [
'-f',
this.options.format,
'-t',
'commonmark-raw_html',
'--wrap=none',
'--extract-media',
tempDir,
]);
const pandocArgs = ['-f', this.options.format, '-t', 'commonmark-raw_html', '--wrap=none'];

// Only add extract-media option if enabled
if (this.options.extractMedia) {
pandocArgs.push('--extract-media', tempDir);
}

const pandoc = spawn('pandoc', pandocArgs);

return new Promise((resolve, reject) => {
let stdout = '';
Expand All @@ -86,16 +92,25 @@ export class PandocParser extends BaseParser {

pandoc.on('close', async (code) => {
try {
if (code === 0) {
const images = await this.readImagesFromDir(mediaDir);
resolve({
content: stdout,
images,
metadata: { format: this.options.format },
});
} else {
reject(new Error(`Pandoc failed with code ${code}: ${stderr}`));
// Handle warnings in stderr
if (stderr) {
if (this.isWarning(stderr)) {
this.logger.warn(`Pandoc warning: ${stderr}`);
} else if (code !== 0) {
// Only reject if it's an actual error (not a warning) and the process failed
reject(new Error(`Pandoc failed with code ${code}: ${stderr}`));
return;
}
}

// Only process images if extractMedia is enabled
const images = this.options.extractMedia ? await this.readImagesFromDir(mediaDir) : {};

resolve({
content: stdout,
images,
metadata: { format: this.options.format },
});
} finally {
await this.cleanupTempDir(tempDir);
}
Expand Down
23 changes: 23 additions & 0 deletions apps/api/src/misc/misc.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,29 @@ export class MiscController {
}

/**
 * Convert an uploaded file's text content via the misc service.
 *
 * The file is read from the multipart field named `file` and decoded as
 * UTF-8. `from` falls back to 'html' and `to` falls back to 'markdown'
 * when the body does not provide them.
 *
 * @throws ParamsError when no file was uploaded.
 */
@UseGuards(JwtAuthGuard)
@Post('convert')
@UseInterceptors(FileInterceptor('file'))
async convert(
  @UploadedFile() file: Express.Multer.File,
  @Body() body: { from?: string; to?: string },
): Promise<{ data: { content: string } }> {
  if (!file) {
    throw new ParamsError('File is required');
  }

  const converted = await this.miscService.convert({
    content: file.buffer.toString('utf-8'),
    from: body.from ?? 'html',
    to: body.to ?? 'markdown',
  });

  return buildSuccessResponse({ content: converted });
}

@Get('static/:objectKey')
@Header('Access-Control-Allow-Origin', '*')
@Header('Cross-Origin-Resource-Policy', 'cross-origin')
Expand Down
18 changes: 18 additions & 0 deletions apps/api/src/misc/misc.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
} from '@refly-packages/errors';
import { FileObject } from '@/misc/misc.dto';
import { createId } from '@paralleldrive/cuid2';
import { ParserFactory } from '@/knowledge/parsers/factory';

@Injectable()
export class MiscService implements OnModuleInit {
Expand Down Expand Up @@ -576,4 +577,21 @@ export class MiscService implements OnModuleInit {

this.logger.log(`Successfully cleaned up ${orphanedFiles.length} orphaned files`);
}

/**
 * Convert text content using the 'pandoc' parser with media extraction
 * disabled.
 *
 * Note that `to` is only included in the failure log message here; it is
 * not passed to the parser, so the output format is whatever the parser
 * itself produces.
 *
 * @param param.content - Source text to convert.
 * @param param.from - Input format handed to the parser as `format`.
 * @param param.to - Requested target format (used for logging only).
 * @returns The parser's `content`, or an empty string when it is nullish.
 * @throws Re-throws whatever the parser throws, after logging it.
 */
async convert(param: { content: string; from: string; to: string }): Promise<string> {
  const { content, from, to } = param;
  const parser = new ParserFactory(this.config).createParser('pandoc', {
    format: from,
    extractMedia: false,
  });

  try {
    const result = await parser.parse(content);
    return result.content ?? '';
  } catch (error: unknown) {
    // A thrown value is not guaranteed to be an Error; narrow it before
    // reading `stack` so the log never ends in "undefined".
    const detail = error instanceof Error ? (error.stack ?? error.message) : String(error);
    this.logger.error(`Convert from ${from} to ${to} failed: ${detail}`);
    throw error;
  }
}
}
72 changes: 55 additions & 17 deletions apps/extension/src/components/content-clipper/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { HiOutlineDocumentDownload } from 'react-icons/hi';
import { useTranslation } from 'react-i18next';
import { useSaveSelectedContent } from '@/hooks/use-save-selected-content';
import { useSaveResourceNotify } from '@refly-packages/ai-workspace-common/hooks/use-save-resouce-notify';
import getClient from '@refly-packages/ai-workspace-common/requests/proxiedRequest';
import {
onMessage,
sendMessage,
Expand Down Expand Up @@ -37,16 +38,53 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
const { saveSelectedContent } = useSaveSelectedContent();
const { handleSaveResourceAndNotify } = useSaveResourceNotify();

/**
 * Store the page content received from the content script, converting its
 * HTML to Markdown through the `convert` endpoint when content is present.
 *
 * If the message has no body nothing is stored. If the content is empty,
 * the conversion returns no content, or the request throws, the original
 * body is stored unchanged. The clipping flag is cleared in every case.
 */
const setPageContentResponse = async (data: BackgroundMessage) => {
  const pageBody = data?.body;

  if (pageBody) {
    // Start from the untouched body; only replace it on a successful conversion.
    let nextPageInfo = pageBody;

    try {
      if (pageBody.content) {
        // The client receives the File object directly, so no FormData
        // wrapper or intermediate Blob is needed.
        const htmlFile = new File([pageBody.content], 'content.html', { type: 'text/html' });

        const result = await getClient().convert({
          body: {
            from: 'html',
            to: 'markdown',
            file: htmlFile,
          },
        });

        const markdown = result?.data?.data?.content;
        if (markdown) {
          nextPageInfo = { ...pageBody, content: markdown };
        }
      }
    } catch (err) {
      console.error('Failed to convert HTML to Markdown:', err);
    }

    setPageInfo(nextPageInfo);
  }

  setIsClipping(false);
};

// Listen for content from content script
useEffect(() => {
onMessage((event: MessageEvent<any>) => {
const data = event as any as BackgroundMessage;
if (data?.name === 'getPageContentResponse') {
const response = data;
if (response?.body) {
setPageInfo(response.body);
}
setIsClipping(false);
setPageContentResponse(data);
}
}, getRuntime());
}, []);
Expand All @@ -64,7 +102,7 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
} catch (err) {
setIsClipping(false);
console.error('Failed to clip content:', err);
message.error(t('extension.webClipper.error.clipContentFailed'));
message.error(t('translation:extension.webClipper.error.clipContentFailed'));
}
}, [t]);

Expand All @@ -75,18 +113,18 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
if (text) {
setPageInfo((prev) => ({ ...prev, content: text }));
} else {
message.warning(t('extension.webClipper.error.clipboardEmpty'));
message.warning(t('translation:extension.webClipper.error.clipboardEmpty'));
}
} catch (err) {
console.error('Failed to read clipboard:', err);
message.error(t('extension.webClipper.error.clipboardReadFailed'));
message.error(t('translation:extension.webClipper.error.clipboardReadFailed'));
}
}, [t]);

// Handle save content
const handleSave = useCallback(async () => {
if (!pageInfo.content?.trim()) {
message.warning(t('extension.webClipper.error.contentRequired'));
message.warning(t('translation:extension.webClipper.error.contentRequired'));
return;
}

Expand All @@ -98,7 +136,7 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
title: pageInfo.title,
url: pageInfo.url,
});
if (result?.success) {
if (result?.res?.success) {
setTimeout(() => {
setPageInfo({ title: '', url: '', content: '' });
onSaveSuccess?.();
Expand All @@ -108,7 +146,7 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
});
} catch (err) {
console.error('Failed to save content:', err);
message.error(t('extension.webClipper.error.saveFailed'));
message.error(t('translation:extension.webClipper.error.saveFailed'));
} finally {
setIsSaving(false);
}
Expand Down Expand Up @@ -140,7 +178,7 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
<div className={`flex flex-col gap-4 p-0 ${className}`}>
<div className="flex flex-col gap-2">
<TextArea
placeholder={t('extension.webClipper.placeholder.enterOrClipContent')}
placeholder={t('translation:extension.webClipper.placeholder.enterOrClipContent')}
value={pageInfo.content}
onChange={handleContentChange}
onKeyDown={handleKeyDown}
Expand All @@ -151,10 +189,10 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
<div className="flex flex-row gap-2">
{pageInfo.content && (
<Button size="small" icon={<IconDelete />} onClick={handleClear}>
{t('extension.webClipper.action.clear')}
{t('translation:extension.webClipper.action.clear')}
</Button>
)}
<Tooltip title={t('extension.webClipper.info.saveToLibrary')}>
<Tooltip title={t('translation:extension.webClipper.info.saveToLibrary')}>
<Button
type="primary"
size="small"
Expand All @@ -165,7 +203,7 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
<div className="flex items-center justify-center rounded px-[1px] h-[12px] text-[10px] font-medium leading-none">
<span className="text-[10px] font-medium leading-none mr-2">⇧+↵</span>
<span className="text-[10px] font-medium leading-none translate-y-[0.5px]">
{t('extension.webClipper.action.save')}
{t('translation:extension.webClipper.action.save')}
</span>
</div>
</Button>
Expand All @@ -183,9 +221,9 @@ export const ContentClipper: React.FC<ContentClipperProps> = ({ className, onSav
loading={isClipping}
className="flex-1"
>
{t('extension.webClipper.action.clip')}
{t('translation:extension.webClipper.action.clip')}
</Button>
<Tooltip title={t('extension.webClipper.action.fromClipboard')}>
<Tooltip title={t('translation:extension.webClipper.action.fromClipboard')}>
<Button size="large" icon={<IconPaste />} onClick={handleGetClipboard} />
</Tooltip>
</div>
Expand Down
Loading