Skip to content

Commit

Permalink
feat(community): added code blocks in markdown into document's page content (#7178)
Browse files Browse the repository at this point in the history
  • Loading branch information
FaresKi authored Nov 11, 2024
1 parent 831f9de commit 6db5fb8
Showing 1 changed file with 32 additions and 7 deletions.
39 changes: 32 additions & 7 deletions libs/langchain-community/src/document_loaders/web/confluence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,27 +218,52 @@ export class ConfluencePagesLoader extends BaseDocumentLoader {
* @returns A Document instance.
*/
private createDocumentFromPage(page: ConfluencePage): Document {
const htmlContent = page.body.storage.value;

// Handle both self-closing and regular macros for attachments and view-file
const htmlWithoutOtherMacros = htmlContent.replace(
/<ac:structured-macro\s+ac:name="(attachments|view-file)"[^>]*(?:\/?>|>.*?<\/ac:structured-macro>)/gs,
"[ATTACHMENT]"
);

// Extract and preserve code blocks with unique placeholders
const codeBlocks: { language: string; code: string }[] = [];
const htmlWithPlaceholders = htmlWithoutOtherMacros.replace(
/<ac:structured-macro.*?<ac:parameter ac:name="language">(.*?)<\/ac:parameter>.*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body><\/ac:structured-macro>/g,
(_, language, code) => {
const placeholder = `CODE_BLOCK_${codeBlocks.length}`;
codeBlocks.push({ language, code: code.trim() });
return `\n${placeholder}\n`;
}
);

// Convert the HTML content to plain text
const plainTextContent = htmlToText(page.body.storage.value, {
let plainTextContent = htmlToText(htmlWithPlaceholders, {
wordwrap: false,
preserveNewlines: false,
preserveNewlines: true,
});

// Reinsert code blocks with proper markdown formatting
codeBlocks.forEach(({ language, code }, index) => {
const placeholder = `CODE_BLOCK_${index}`;
plainTextContent = plainTextContent.replace(
placeholder,
`\`\`\`${language}\n${code}\n\`\`\``
);
});

// Remove empty lines
const textWithoutEmptyLines = plainTextContent.replace(/^\s*[\r\n]/gm, "");

// Generate the URL
const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

// Return a langchain document
// Rest of the method remains the same...
return new Document({
pageContent: textWithoutEmptyLines,
metadata: {
id: page.id,
status: page.status,
title: page.title,
type: page.type,
url: pageUrl,
url: `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`,
version: page.version?.number,
updated_by: page.version?.by?.displayName,
updated_at: page.version?.when,
Expand Down

0 comments on commit 6db5fb8

Please sign in to comment.