odfjs/scripts/odf/odt/getOdtTextContent.js

import { ZipReader, Uint8ArrayReader, TextWriter } from '@zip.js/zip.js';
import {parseXML, Node} from '../../DOMUtils.js'

/** @import {ODTFile} from '../templating/fillOdtTemplate.js' */

/**
 * Opens an ODT archive, reads its `content.xml` entry as text and parses it
 * into a DOM document.
 *
 * The zip reader is always closed before this function settles, including
 * when `content.xml` is missing or when reading the archive fails.
 *
 * @param {ODTFile} odtFile - ODT archive bytes; passed to `new Uint8Array(...)`
 * @returns {Promise<Document>} Result of `parseXML` applied to the text of `content.xml`
 * @throws {Error} If the archive has no entry named exactly `content.xml`
 */
export async function getContentDocument(odtFile) {
    const reader = new ZipReader(new Uint8ArrayReader(new Uint8Array(odtFile)));

    let contentText;

    try {
        const entries = await reader.getEntries();

        const contentEntry = entries.find(entry => entry.filename === 'content.xml');

        if (!contentEntry) {
            throw new Error('No content.xml found in the ODT file');
        }

        // NOTE(review): the ts-ignore presumably silences `getData` being typed
        // as optional on zip entries; confirm against the zip.js typings.
        // @ts-ignore
        contentText = await contentEntry.getData(new TextWriter());
    } finally {
        // Release the reader on every path, not only on success
        await reader.close();
    }

    return parseXML(contentText)
}

/**
 * Looks up the first `office:text` element nested inside the first
 * `office:body` element of the given document.
 *
 * @param {Document} odtDocument - Parsed `content.xml` document
 * @returns {Element} First `office:text` element found under `office:body`
 */
function getODTTextElement(odtDocument) {
    const officeBody = odtDocument.getElementsByTagName('office:body')[0]
    const officeText = officeBody.getElementsByTagName('office:text')[0]

    return officeText
}

/**
 * Produces a plain-text rendering of an ODT file's body.
 *
 * Each `text:h` / `text:p` element contributes its `textContent` followed by
 * a newline; the text gathered for a `text:list-item` is prefixed with `- `.
 * Other elements contribute only what their element children produce.
 *
 * @param {ArrayBuffer} odtFile - The ODT file as an ArrayBuffer
 * @returns {Promise<string>} Extracted text content
 */
export async function getOdtTextContent(odtFile) {
    const textRoot = getODTTextElement(await getContentDocument(odtFile))

    /**
     * Recursively builds the text for one element.
     *
     * @param {Element} node
     * @returns {string}
     */
    const collectText = node => {
        const {tagName} = node

        // Headings and paragraphs are leaves of the walk: one line each
        if(tagName === 'text:p' || tagName === 'text:h'){
            return node.textContent + '\n'
        }

        // Only element children are visited; other node types are skipped
        let gathered = ''
        for(const child of Array.from(node.childNodes)){
            if(child.nodeType === Node.ELEMENT_NODE){
                gathered += collectText(child)
            }
        }

        return tagName === 'text:list-item' ? `- ${gathered}` : gathered
    }

    return collectText(textRoot)
}
expose odt text function (#1) * Remove xlsx support * Restructure exports to avoid duplication of DOM-related code * browser DOM exports * Fixing exports field in package.json 2025-04-17 17:39:08 +02:00			`import { ZipReader, Uint8ArrayReader, TextWriter } from '@zip.js/zip.js';`
			`import {parseXML, Node} from '../../DOMUtils.js'`

Formatted markers (#5) * Adding failing test case * test runs and fails * passing formatting test * passing test * refactoring moving different parts to their own files * Refactoring - doc in prepareTemplateDOMTree * Test of 2 formatted markers within same Text node passes * passing test with {#each ...} and text before partially formatted * Test with {/each} and text after partially formatted passing * woops with proper test case * test with partially formatted variable passes 2025-05-08 17:13:51 +02:00			`/** @import {ODTFile} from '../templating/fillOdtTemplate.js' */`
expose odt text function (#1) * Remove xlsx support * Restructure exports to avoid duplication of DOM-related code * browser DOM exports * Fixing exports field in package.json 2025-04-17 17:39:08 +02:00
			`/**`
			`* @param {ODTFile} odtFile`
			`* @returns {Promise<Document>}`
			`*/`
Add images (#16) * add template * Rename template * Add test for insert 2 images * image marker regex * Ajout d'un test pour vérifier que le texte du template est bon * WORK IN PROGRESS - trouver et évaluer la balise image * Create OfjsImage type * create addImageToOdtFile * Regenerate yo odt to inspect it * Add a draw image and a draw frame into odt file * Test if there are two draw:image in the generated document * Add pictures in manifest.xml to fix corrupted file * Adapt anchor type * Fix images aspect with ratio 2025-09-16 16:43:47 +02:00			`export async function getContentDocument(odtFile) {`
expose odt text function (#1) * Remove xlsx support * Restructure exports to avoid duplication of DOM-related code * browser DOM exports * Fixing exports field in package.json 2025-04-17 17:39:08 +02:00			`const reader = new ZipReader(new Uint8ArrayReader(new Uint8Array(odtFile)));`

			`const entries = await reader.getEntries();`

			`const contentEntry = entries.find(entry => entry.filename === 'content.xml');`

			`if (!contentEntry) {`
			`throw new Error('No content.xml found in the ODT file');`
			`}`

			`// @ts-ignore`
			`const contentText = await contentEntry.getData(new TextWriter());`
			`await reader.close();`

			`return parseXML(contentText)`
			`}`

			`/**`
			`*`
			`* @param {Document} odtDocument`
			`* @returns {Element}`
			`*/`
			`function getODTTextElement(odtDocument) {`
			`return odtDocument.getElementsByTagName('office:body')[0]`
			`.getElementsByTagName('office:text')[0]`
			`}`

			`/**`
			`* Extracts plain text content from an ODT file, preserving line breaks`
			`* @param {ArrayBuffer} odtFile - The ODT file as an ArrayBuffer`
			`* @returns {Promise<string>} Extracted text content`
			`*/`
			`export async function getOdtTextContent(odtFile) {`
			`const contentDocument = await getContentDocument(odtFile)`
			`const odtTextElement = getODTTextElement(contentDocument)`

			`/**`
			`*`
			`* @param {Element} element`
			`* @returns {string}`
			`*/`
			`function getElementTextContent(element){`
			`//console.log('tagName', element.tagName)`
			`if(element.tagName === 'text:h' \|\| element.tagName === 'text:p')`
			`return element.textContent + '\n'`
			`else{`
			`const descendantTexts = Array.from(element.childNodes)`
			`.filter(n => n.nodeType === Node.ELEMENT_NODE)`
			`.map(getElementTextContent)`

			`if(element.tagName === 'text:list-item')`
			return `- ${descendantTexts.join('')}`

			`return descendantTexts.join('')`
			`}`
			`}`

			`return getElementTextContent(odtTextElement)`
			`}`