python-docx adding bold and non-bold strings to same cell in table

I'm using python-docx to create a document with a table I want to populate from textual data. My text looks like this:

01:02:10.3 
a: Lorem ipsum dolor sit amet,  
b: consectetur adipiscing elit.
a: Mauris a turpis erat. 
01:02:20.4 
a: Vivamus dignissim aliquam
b: Nam ultricies
(etc.)

I need to organize it in a table like this (using ASCII for visualization):

+---+--------------------+---------------------------------+
|   |         A          |                B                |
+---+--------------------+---------------------------------+
| 1 | 01:02:10.3         | a: Lorem ipsum dolor sit amet,  |
| 2 |                    | b: consectetur adipiscing elit. |
| 3 |                    | a: Mauris a turpis erat.        |
| 4 | ------------------ | ------------------------------- |
| 5 | 01:02:20.4         | a: Vivamus dignissim aliquam    |
| 6 |                    | b: Nam ultricies                |
+---+--------------------+---------------------------------+

However, I need everything after "a: " to be bold and everything after "b: " to be non-bold, while both occupy the same cell. It's pretty easy to iterate and organize this the way I want, but I'm really unsure how to make only some of the lines bold:

# Speaker prefix -> whether that speaker's lines should be rendered bold.
IS_BOLD = {
    'a': True,
    'b': False
}

row_cells = table.add_row().cells

for line in lines:
    if is_timestamp(line):  # function that uses regex to discern between columns
        if row_cells[1]:
            row_cells = table.add_row().cells

        row_cells[0].text = line

    else:
        row_cells[1].text += line

        if IS_BOLD[line.split(":")[0]]:
            # make only this line within the cell bold, somehow.
            pass

(This is sort of pseudo-code; I'm doing some more text processing, but that's irrelevant here.) I found one probably relevant question where someone uses something called a run, but I'm finding it hard to understand how to apply it to my case.

Any help? Thanks.

You need to add a run to the cell's paragraph. This way you can control exactly which text is bold.

Full example:

from docx import Document
from docx.shared import Inches
import os
import re


def is_timestamp(line):
    """Return True when *line* starts with three colon-separated digit pairs.

    Only the leading ``NN:NN:NN`` shape is checked; whatever follows it
    (fractional seconds, trailing spaces, ...) is ignored. This is a loose
    heuristic, so swap in a stricter parser if you have one.
    """
    found = re.match(r'^\d{2}:\d{2}:\d{2}', line)
    if found is None:
        return False
    return True


def parse_raw_script(raw_script):
    """Yield one ``{'timestamp': ..., 'content': ...}`` dict per timestamp.

    Every line is stripped first. A line recognised by ``is_timestamp``
    opens a new record; the lines that follow it are joined with newlines
    into that record's ``content``. Lines that appear before the first
    timestamp are not emitted.
    """
    stamp = ''
    body = ''
    for raw_line in raw_script.splitlines():
        text = raw_line.strip()

        if not is_timestamp(text):
            # Separate content lines with a newline, but never lead with one.
            body = body + '\n' + text if body else text
            continue

        # A new timestamp closes the record that was open, if any.
        if stamp:
            yield {'timestamp': stamp, 'content': body}
        stamp, body = text, ''

    # Flush the record still open when the input ends.
    if stamp:
        yield {'timestamp': stamp, 'content': body}


def should_bold(line):
    """Return True when *line* belongs to the speaker whose text is bold.

    Placeholder rule: only lines starting with ``a:`` qualify. Replace it
    with your own logic as needed.
    """
    bold_prefix = 'a:'
    return line.startswith(bold_prefix)


def load_raw_script():
    """Return the sample transcript from the question as a single string.

    Stand-in for real input; presumably you would read this from a file
    instead. Trailing spaces on some lines are kept exactly as in the
    sample.
    """
    sample_lines = (
        '01:02:10.3 ',
        'a: Lorem ipsum dolor sit amet,  ',
        'b: consectetur adipiscing elit.',
        'a: Mauris a turpis erat. ',
        '01:02:20.4 ',
        'a: Vivamus dignissim aliquam',
        'b: Nam ultricies',
    )
    return '\n'.join(sample_lines)


def convert_raw_script_to_docx(raw_script, output_file_path):
    """Render a raw script as a three-column table and save it as a .docx.

    The table starts with a header row ('', 'A', 'B'). Each record yielded
    by ``parse_raw_script`` then becomes one row: the timestamp goes in
    column A and the content lines go in column B. Every content line is
    added as its own run so it can be made bold independently of its
    neighbours (decided by ``should_bold``).

    Args:
        raw_script: The whole script as a single string.
        output_file_path: Path the resulting document is saved to.
    """
    document = Document()
    table = document.add_table(rows=1, cols=3, style="Table Grid")

    # add header row (look the cells up once, not once per assignment)
    header_cells = table.rows[0].cells
    header_cells[0].text = ''
    header_cells[1].text = 'A'
    header_cells[2].text = 'B'

    # create a row for each timestamp record
    for script_row in parse_raw_script(raw_script):
        cells = table.add_row().cells
        cells[1].text = script_row['timestamp']

        # using the cell's default paragraph here instead of creating one
        content_paragraph = cells[2].paragraphs[0]

        lines = script_row['content'].splitlines()
        last_index = len(lines) - 1
        for index, line in enumerate(lines):
            run = content_paragraph.add_run(line)
            if should_bold(line):
                run.bold = True

            # Break only *between* lines: a break after the final line
            # would leave an empty line at the bottom of the cell.
            if index < last_index:
                run.add_break()

    # resize table columns (optional); widths are set on every cell
    for row in table.rows:
        cells = row.cells
        cells[0].width = Inches(0.2)
        cells[1].width = Inches(1.9)
        cells[2].width = Inches(3.9)

    document.save(output_file_path)


def main():
    """Build the sample document at ``dist/so-template.docx``.

    The ``dist`` directory is created next to this script when missing.
    """
    script_dir = os.path.dirname(__file__)
    dist_dir = os.path.join(script_dir, 'dist')

    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the directory in between.
    os.makedirs(dist_dir, exist_ok=True)

    output_file_path = os.path.join(dist_dir, 'so-template.docx')
    raw_script = load_raw_script()
    convert_raw_script_to_docx(raw_script, output_file_path)


if __name__ == '__main__':
    main()

Result (file should be in ./dist/so-template.docx):

BTW - if you prefer sticking with your own example, this is what needs to be changed:

# Speaker prefix -> whether that speaker's lines should be rendered bold.
IS_BOLD = {
    'a': True,
    'b': False,
}

row_cells = table.add_row().cells
last_run = None  # most recent run added to the current row's content cell

for line in lines:
    if is_timestamp(line):
        # A cell object is always truthy, so test the cells' text instead:
        # start a new row only when the current one is already in use.
        if row_cells[0].text or row_cells[1].text:
            row_cells = table.add_row().cells
            last_run = None
        row_cells[0].text = line

    else:
        # Put the line break *between* lines (on the previous run) so the
        # cell does not end with an empty line.
        if last_run is not None:
            last_run.add_break()

        last_run = row_cells[1].paragraphs[0].add_run(line)

        # Lines without a known speaker prefix are left non-bold instead of
        # raising KeyError.
        speaker = line.split(":")[0].strip()
        if IS_BOLD.get(speaker, False):
            last_run.bold = True

Recommended topics

Hot tags