Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extraction of numbering and multi-level sign #1472

Open
lottopotato opened this issue Feb 26, 2025 · 2 comments
Open

Extraction of numbering and multi-level sign #1472

lottopotato opened this issue Feb 26, 2025 · 2 comments

Comments

@lottopotato
Copy link

lottopotato commented Feb 26, 2025

Hello everyone.

I need to extract numbering and multilevel lists for my work, so I wrote the following extraction code:

def get_qn_name(tag_name):
    return docx.oxml.ns.qn(tag_name)

def get_numbering_part(doc):
    numbering_part_element = doc.part.numbering_part.element
    w_val = get_qn_name('w:val')
    # 1. Locate all w:num elements. Each w:num element contains a numId attribute that associates it with a paragraph.
    w_num = get_qn_name('w:num')
    num_elements = numbering_part_element.findall(w_num)
    
    # 2. Locate all abstractNumId elements. Typically, there is one abstractNumId element.
    absNumId_to_numId = {}
    w_abstractNumId = get_qn_name('w:abstractNumId')
    w_numId = get_qn_name('w:numId')
    for num_element in num_elements:
        abstractNumId = num_element.findall(w_abstractNumId)
        if len(abstractNumId) == 0:
            continue
        abstractNumId = abstractNumId[0]
        abstractNumId = abstractNumId.get(w_val)
        numId = num_element.get(w_numId)
        if abstractNumId is not None and numId is not None:
            absNumId_to_numId.update({abstractNumId: numId})
    
    # 3. Locate all abstractNum elements.
    w_abstractNum = get_qn_name('w:abstractNum')
    abstractNum_elements = numbering_part_element.findall(w_abstractNum)
    
    # 4. Within each abstractNum element, examine the abstractNumId, lvl, lvlText, and numFmt elements. 
    # Under normal circumstances, there would be only one lvlText and one numFmt element for each level.
    w_lvl = get_qn_name('w:lvl')
    w_ilvl = get_qn_name('w:ilvl')
    w_lvlText = get_qn_name('w:lvlText')
    w_numFmt = get_qn_name('w:numFmt')
    
    numbering_part = {}
    for abstractNum_element in abstractNum_elements:
        abstractNumId = abstractNum_element.get(w_abstractNumId)
        if abstractNumId is None:
            continue
        bucket = {}
        lvl_elements = abstractNum_element.findall(w_lvl)
        numFmt = 'decimal'
        for lvl_element in lvl_elements:
            ilvl = lvl_element.get(w_ilvl)
            if ilvl is None:
                continue
                
            lvlText_elements = lvl_element.findall(w_lvlText)
            numFmt_elements = lvl_element.findall(w_numFmt)
            if len(lvlText_elements) == 0 or len(numFmt_elements) == 0:
                continue
            numFmt = numFmt_elements[0].get(w_val)
            text = lvlText_elements[0].get(w_val)
            bucket.update({ilvl:[text, numFmt]})
            
        if abstractNumId in absNumId_to_numId.keys():
            numbering_part.update({
                absNumId_to_numId[abstractNumId]: bucket
            })
    return numbering_part

def get_known_formats():
    def generate_roman_numerals(limit = 100, uppercase = True):
        roman_map = { 1: 'I', 4: 'IV', 5: 'V', 9: 'IX', 10: 'X', 40: 'XL', 50: 'L', 90: 'XC', 100: 'C'}

        roman_numerals = []
        for i in range(1, limit + 1):
            result = ""
            for value, numeral in sorted(roman_map.items(), reverse=True):
                while i >= value:
                    result += numeral
                    i -= value
            result = result if uppercase else result.lower()
            roman_numerals.append(result)
        return roman_numerals
    
    ENG_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    upperLetter = list(ENG_LETTERS) + [
        f'{a}{b}'
        for a in list(ENG_LETTERS) for b in list(ENG_LETTERS)   
    ]
    lowerLetter = list(ENG_LETTERS.lower()) + [
        f'{a}{b}'
        for a in list(ENG_LETTERS.lower()) for b in list(ENG_LETTERS.lower())
    ]
    upperRoman = generate_roman_numerals()
    lowerRoman = generate_roman_numerals(uppercase = False)
    return {
        'upperLetter': upperLetter,
        'lowerLetter': lowerLetter,
        'upperRoman': upperRoman,
        'lowerRoman': lowerRoman
    }

def get_string_for_format(format, stack_number, known_formats):
    if format in known_formats.keys():
        if len(known_formats[format]) > stack_number:
            return known_formats[format][stack_number]
    return stack_number+1

def apply_numbering(numId_val, ilvl_val, numbering_part, numbering_part_stack, known_formats):
    numbering = numbering_part[numId_val]
    if not numId_val in numbering_part_stack.keys():
        numbering_part_stack.update({numId_val: {}})
        
    number_format, format = numbering[ilvl_val]
    ilvl_val = int(ilvl_val)
    if not ilvl_val in numbering_part_stack[numId_val].keys():
        numbering_part_stack[numId_val].update({ilvl_val: 0})
    else:
        for drop in range(max(numbering_part_stack[numId_val].keys())-ilvl_val):
            drop = ilvl_val+drop+1
            if drop in numbering_part_stack[numId_val].keys():
                numbering_part_stack[numId_val].pop(drop)
        numbering_part_stack[numId_val][ilvl_val] += 1
    search = re.findall(r'(\%\d+)', number_format)
    if len(search) == 0:
        return number_format
    number_format = re.sub(r'(\%\d+)', '{}', number_format)
    
    format_letters = []
    for _ in range(len(search)):
        stack_number = numbering_part_stack[numId_val][ilvl_val]
        _, format = numbering[str(ilvl_val)]
        letter = get_string_for_format(format, stack_number, known_formats)
        format_letters.append(letter)
        ilvl_val -= 1
    
    return number_format.format(*format_letters[::-1]), numbering_part_stack

def get_ppr_val(para):
    if not isinstance(para, docx.oxml.text.paragraph.CT_P):
        return None, None
    ilvl_val = None
    numId_val = None
    val_name = docx.oxml.ns.qn('w:val')
    ppr = para.pPr
    if ppr is not None:
        numpr = ppr.numPr
        if numpr is not None:
            numId = numpr.numId
            if numId is not None:
                numId_val = numId.get(val_name)    
            ilvl = numpr.ilvl
            if ilvl is not None:
                ilvl_val = ilvl.get(val_name)
            
    return numId_val, ilvl_val

Image
Image

numbering_part = get_numbering_part(doc)
known_formats = get_known_formats()
numbering_part_stack = {}
for i, element in  enumerate(doc.element.body):
    text = element.text
    numId_val, ilvl_val = get_ppr_val(element)
    if not text is None:
        print(i)
        print('-'*50)
        if not numId_val is None:
            prefix, numbering_part_stack = apply_numbering(
                numId_val, ilvl_val, 
                numbering_part, numbering_part_stack, known_formats)
            print(f'{prefix} {text}')
        else:
            print(text)
        print('-'*50)
        print()
0
--------------------------------------------------
Numbering
--------------------------------------------------

1
--------------------------------------------------
1. 1
--------------------------------------------------

2
--------------------------------------------------
A. 1-A
--------------------------------------------------

3
--------------------------------------------------
i. 1-A-Roman 1
--------------------------------------------------

4
--------------------------------------------------
ii. 1-A-Roman 2
--------------------------------------------------

5
--------------------------------------------------
B. 1-B
--------------------------------------------------

6
--------------------------------------------------
2. 2
--------------------------------------------------

7
--------------------------------------------------
3. 3
--------------------------------------------------

8
--------------------------------------------------
4. 4
--------------------------------------------------

9
--------------------------------------------------
Step1. step1
--------------------------------------------------

10
--------------------------------------------------
Step2. step2
--------------------------------------------------

11
--------------------------------------------------
Step3. step3
--------------------------------------------------

12
--------------------------------------------------
1 Circle 1
--------------------------------------------------

13
--------------------------------------------------
A. Circle 1-A
--------------------------------------------------

14
--------------------------------------------------
B. Circle 1-B
--------------------------------------------------

15
--------------------------------------------------
i. Circle 1-B-Roman 1
--------------------------------------------------

16
--------------------------------------------------
ii. Circle 1-B-Roman 2
--------------------------------------------------

17
--------------------------------------------------
C. Circle 1-C
--------------------------------------------------

18
--------------------------------------------------
2 Circle 2
--------------------------------------------------

19
--------------------------------------------------
3 Circle 3
--------------------------------------------------

20
--------------------------------------------------
Multi-Level
--------------------------------------------------

21
--------------------------------------------------
1. 1
--------------------------------------------------

22
--------------------------------------------------
1.1. 1-1
--------------------------------------------------

23
--------------------------------------------------
1.1.1. 1-1-1
--------------------------------------------------

24
--------------------------------------------------
1.1.2. 1-1--2
--------------------------------------------------

25
--------------------------------------------------
1.2. 1-2
--------------------------------------------------

26
--------------------------------------------------
1.2.1. 1-2-1
--------------------------------------------------

27
--------------------------------------------------
1.3. 1-3
--------------------------------------------------

28
--------------------------------------------------
2. 2
--------------------------------------------------

29
--------------------------------------------------
3. 3
--------------------------------------------------

If you have any alternative approaches or improvements (such as supporting more formats in "known_formats"),
please share them with us. Thank you.

@Frazer-Ng
Copy link

Hi @lottopotato , thank you for your code, I was working on the same thing just this week.

I've encountered a small error on get_numbering_part(). As one w:abstractNum can have multiple w:num elements pointing to it, having absNumId_to_numId be a one-to-one mapping will not suffice. The last numId will override the earlier numIds when being assigned to the same abstractNumId

I've modified absNumId_to_numId to be a dict of list to support one-to-many relationship. Here's the updated function:

from collections import defaultdict

def get_numbering_part(doc):
    numbering_part_element = doc.part.numbering_part.element
    w_val = qn('w:val')
    # 1. Locate all w:num elements. Each w:num element contains a numId attribute that associates it with a paragraph.
    w_num = qn('w:num')
    num_elements = numbering_part_element.findall(w_num)

    # 2. Locate all abstractNumId elements. Typically, there is one abstractNumId element.
    absNumId_to_numId = defaultdict(list)
    w_abstractNumId = qn('w:abstractNumId')
    w_numId = qn('w:numId')
    for num_element in num_elements:
        abstractNumId = num_element.findall(w_abstractNumId)
        if len(abstractNumId) == 0:
            continue
        abstractNumId = abstractNumId[0]
        abstractNumId = abstractNumId.get(w_val)
        numId = num_element.get(w_numId)
        if abstractNumId is not None and numId is not None:
            absNumId_to_numId[abstractNumId].append(numId)

    # 3. Locate all abstractNum elements.
    w_abstractNum = qn('w:abstractNum')
    abstractNum_elements = numbering_part_element.findall(w_abstractNum)

    # 4. Within each abstractNum element, examine the abstractNumId, lvl, lvlText, and numFmt elements. 
    # Under normal circumstances, there would be only one lvlText and one numFmt element for each level.
    w_lvl = qn('w:lvl')
    w_ilvl = qn('w:ilvl')
    w_lvlText = qn('w:lvlText')
    w_numFmt = qn('w:numFmt')
    
    numbering_part = {}
    for abstractNum_element in abstractNum_elements:
        abstractNumId = abstractNum_element.get(w_abstractNumId)
        if abstractNumId is None:
            continue
        bucket = {}
        lvl_elements = abstractNum_element.findall(w_lvl)
        numFmt = 'decimal'
        for lvl_element in lvl_elements:
            ilvl = lvl_element.get(w_ilvl)
            if ilvl is None:
                continue
                
            lvlText_elements = lvl_element.findall(w_lvlText)
            numFmt_elements = lvl_element.findall(w_numFmt)
            if len(lvlText_elements) == 0 or len(numFmt_elements) == 0:
                continue
            numFmt = numFmt_elements[0].get(w_val)
            text = lvlText_elements[0].get(w_val)
            bucket.update({ilvl:[text, numFmt]})
            
        if abstractNumId in absNumId_to_numId.keys():
            for numId in absNumId_to_numId[abstractNumId]:
                numbering_part.update({
                    numId: bucket
                })
    return numbering_part


Additionally, as a quick fix to support bullet lists (and other unknown formats), I've modified this line in apply_numbering(), using a simple dash as a default.

    if len(search) == 0: # not a number
        return "- ", numbering_part_stack

@lottopotato
Copy link
Author

@Frazer-Ng
Thanks for your sharing!

I never thought that w:abstractNum could have multiple w:num!
Can you upload the examples for this case?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants