novel-stats/novel_stats/novel_stats.py

#!/usr/bin/python


import argparse
import collections
import tempfile

# Line prefix that starts a new chapter; the rest of the line is the chapter heading.
CHAPTER_MARKER = '## '
# Line prefix that sets the status applied to the text that follows it.
STATUS_MARKER = '[status]: # '
# Line prefix that starts a new act; the rest of the line is the act heading.
ACT_MARKER = '[act]: # '
# Standard markdown comment marker, supported by Pandoc and Calibre's ebook-convert.
# Lines starting with it are skipped when counting words.
COMMENT_MARKER = '[//]: # '


def count_words(line):
    """Return the number of countable words in *line*.

    The line is stripped and then split on single space characters. A token
    is not counted when it is empty or whitespace-only, is a lone ``*``, or
    starts with ``#``.
    """
    tokens = line.strip().split(' ')
    return sum(
        1
        for token in tokens
        if token.strip() and token != '*' and not token.startswith('#')
    )


def _percent(part, total):
    """Return *part* as a whole-number percentage of *total*, rounded down.

    Returns 0 when *total* is zero so that a manuscript without any counted
    words does not raise ZeroDivisionError.
    """
    return part * 100 // total if total else 0


def _collect_stats(lines):
    """Tally word counts from an iterable of markdown lines.

    Returns the tuple ``(total_word_count, word_count_by_chapter,
    word_count_by_status, word_count_by_act, status_by_chapter)``. The
    mappings are keyed by heading/status text; text that appears before the
    first chapter or act marker is filed under the key ``None``.
    """
    chapter_heading = None
    act_heading = None
    total_word_count = 0
    word_count_by_chapter = collections.defaultdict(int)
    word_count_by_status = collections.defaultdict(int)
    word_count_by_act = collections.defaultdict(int)
    status_by_chapter = collections.defaultdict(lambda: collections.defaultdict(int))
    current_status = None

    for line in lines:
        if line.startswith(CHAPTER_MARKER):
            # A new chapter begins: fold the finished chapter into its act and the total.
            word_count_by_act[act_heading] += word_count_by_chapter[chapter_heading]
            total_word_count += word_count_by_chapter[chapter_heading]

            chapter_heading = line[len(CHAPTER_MARKER) :].strip('()\n')

            # Count the words in chapter heading, because the chapter number and title count as words.
            if chapter_heading:
                word_count_by_chapter[chapter_heading] = count_words(chapter_heading)
                current_status = None
        elif line.startswith(STATUS_MARKER):
            # Multiple statuses are allowed in a single chapter; only the first
            # one seen after a chapter heading is credited with the heading's words.
            first_status = current_status is None
            current_status = line[len(STATUS_MARKER) :].strip('()\n')
            if first_status and chapter_heading:
                status_by_chapter[chapter_heading][current_status] = count_words(
                    chapter_heading
                )
        elif line.startswith(ACT_MARKER):
            act_heading = line[len(ACT_MARKER) :].strip('()\n')
            word_count_by_act[act_heading] = count_words(act_heading)
        elif line.startswith(COMMENT_MARKER):
            # Don't count the words in a comment.
            continue
        else:
            line_word_count = count_words(line)
            word_count_by_chapter[chapter_heading] += line_word_count

            if current_status:
                word_count_by_status[current_status] += line_word_count
                status_by_chapter[chapter_heading][current_status] += line_word_count

    # Do some final accounting after the last chapter.
    word_count_by_act[act_heading] += word_count_by_chapter[chapter_heading]
    total_word_count += word_count_by_chapter[chapter_heading]

    return (
        total_word_count,
        word_count_by_chapter,
        word_count_by_status,
        word_count_by_act,
        status_by_chapter,
    )


def main():
    """Parse the command line, count the words in the novel and print a summary.

    Always prints the per-status word counts and the overall total. With
    ``-c``/``--chapter`` a chapter-by-chapter breakdown is printed first, and
    with ``-a``/``--act`` an act-by-act breakdown. With ``-pp`` the input is
    first run through MarkdownPP into a temporary file, which is then counted
    instead of the raw input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--chapter',
        action='store_true',
        help='output chapter-by-chapter breakdown of word counts, including how many words in each chapter are tagged with which status',
    )
    parser.add_argument(
        '-a',
        '--act',
        action='store_true',
        help='output act-by-act breakdown of word counts (total only)',
    )
    parser.add_argument(
        '-pp',
        action='store_true',
        help='run markdown pre-processor, this allows for a multi-file input (e.g. each chapter in its own file), but requires the MarkdownPP python library',
    )
    parser.add_argument(
        'markdown_file',
        type=argparse.FileType('r'),
        help='The markdown file for the novel, main file if a multi-file novel',
    )
    arguments = parser.parse_args()

    # The context managers make sure both the input file and the temporary
    # pre-processed file are closed, even if counting or pre-processing fails.
    with arguments.markdown_file as source_file:
        if arguments.pp:
            # -pp flag to allow Markdown Preprocessing, primarily to allow multi-file novel
            # formatting. Imported here so the library is only required when the flag is used.
            import MarkdownPP

            with tempfile.TemporaryFile(mode='w+') as preprocessed:
                MarkdownPP.MarkdownPP(
                    input=source_file, output=preprocessed, modules=list(MarkdownPP.modules)
                )
                # Rewind so the counting pass reads what the pre-processor just wrote.
                preprocessed.seek(0)
                stats = _collect_stats(preprocessed)
        else:
            stats = _collect_stats(source_file)

    (
        total_word_count,
        word_count_by_chapter,
        word_count_by_status,
        word_count_by_act,
        status_by_chapter,
    ) = stats

    # -c or --chapter to give a chapter-by-chapter word count summary.
    if arguments.chapter:
        for heading, chapter_word_count in word_count_by_chapter.items():
            # The None key holds text that precedes the first chapter heading.
            if heading is None:
                continue

            chapter_statuses = status_by_chapter[heading]

            if len(chapter_statuses) > 1:
                print(f'chapter {heading}:')

                for chapter_status, status_count in chapter_statuses.items():
                    print(f'\t {status_count:,} ({chapter_status})')
                print(f'\t {chapter_word_count:,} words (total)')
            elif len(chapter_statuses) == 1:
                chapter_status = next(iter(chapter_statuses))
                print(f'chapter {heading}: {chapter_word_count:,} ({chapter_status})')
            else:
                print(f'chapter {heading}: {chapter_word_count:,}')

        print()

    # -a or --act to give an act-by-act word count summary.
    if arguments.act:
        for heading, act_word_count in word_count_by_act.items():
            # The None key holds text that precedes the first act marker.
            if heading is None:
                continue

            print(
                f'act {heading}: {act_word_count:,} words (~{_percent(act_word_count, total_word_count)}%)'
            )

        print()

    for status, status_word_count in word_count_by_status.items():
        print(
            f'{status}: {status_word_count:,} words (~{_percent(status_word_count, total_word_count)}%)'
        )

    print(f'total: {total_word_count:,} words')


# Run the command-line tool only when this file is executed as a script.
if __name__ == '__main__':
    main()
Initial import. 2021-09-11 16:35:38 +00:00			`#!/usr/bin/python`


Cleaned up argument parsing 2021-10-23 12:08:12 +00:00			`import argparse`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00			`import collections`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`import tempfile`
Initial import. 2021-09-11 16:35:38 +00:00
			`CHAPTER_MARKER = '## '`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00			`STATUS_MARKER = '[status]: # '`
			`ACT_MARKER = '[act]: # '`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`# Standard markdown comment marker, supported by Pandoc and Calibre's ebook-convert.`
			`COMMENT_MARKER = '[//]: # '`
Initial import. 2021-09-11 16:35:38 +00:00

			`def count_words(line):`
			`count = 0`

			`for word in line.strip().split(' '):`
			`if not word.strip() or word == '*' or word.startswith('#'):`
			`continue`

			`count += 1`

			`return count`


Fix arguments error. 2021-09-11 16:57:48 +00:00			`def main():`
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`# Better argument parsing`
			`parser = argparse.ArgumentParser()`
Cleaned up argument parsing 2021-10-23 12:08:12 +00:00			`parser.add_argument(`
			`'-c',`
			`'--chapter',`
			`action='store_true',`
			`help='output chapter-by-chapter breakdown of word counts, including how many words in each chapter are tagged with which status',`
			`)`
			`parser.add_argument(`
			`'-a',`
			`'--act',`
			`action='store_true',`
			`help='output act-by-act breakdown of word counts (total only)',`
			`)`
			`parser.add_argument(`
			`'-pp',`
			`action='store_true',`
			`help='run markdown pre-processor, this allows for a multi-file input (e.g. each chapter in its own file), but requires the MarkdownPP python library',`
			`)`
			`parser.add_argument(`
			`'markdown_file',`
			`type=argparse.FileType('r'),`
			`help='The markdown file for the novel, main file if a multi-file novel',`
			`)`
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`arguments = parser.parse_args()`

Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`mdfile = None`
Added mdpp support 2021-10-22 09:37:33 +00:00
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`if arguments.pp:`
Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`# -pp flag to allow Markdown Preprocessing primarily to allow multi-file novel formatting`
			`# this is implemented using a temporary file created using python's built-in tempfile library`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`import MarkdownPP`

Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`mdfile = tempfile.TemporaryFile(mode='w+')`
Cleaned up argument parsing 2021-10-23 12:08:12 +00:00			`MarkdownPP.MarkdownPP(`
			`input=arguments.markdown_file, output=mdfile, modules=list(MarkdownPP.modules)`
			`)`
Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`mdfile.seek(0)`
			`else:`
Cleaned up argument parsing 2021-10-23 12:08:12 +00:00			`mdfile = arguments.markdown_file`
Added mdpp support 2021-10-22 09:37:33 +00:00
Allow act/chapter titles changed chapter_number and act_number to chapter_heading and act_heading 2021-10-22 08:41:03 +00:00			`chapter_heading = None`
			`act_heading = None`
Initial import. 2021-09-11 16:35:38 +00:00			`total_word_count = 0`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00			`word_count_by_chapter = collections.defaultdict(int)`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00			`word_count_by_status = collections.defaultdict(int)`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00			`word_count_by_act = collections.defaultdict(int)`
Account for a few more edge cases. 2022-01-10 17:59:18 +00:00			`status_by_chapter = collections.defaultdict(lambda: collections.defaultdict(int))`
Added multi-status chapter support 2021-10-22 09:24:36 +00:00			`current_status = None`
Initial import. 2021-09-11 16:35:38 +00:00
Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`for line in mdfile.readlines():`
Initial import. 2021-09-11 16:35:38 +00:00			`if line.startswith(CHAPTER_MARKER):`
Allow act/chapter titles changed chapter_number and act_number to chapter_heading and act_heading 2021-10-22 08:41:03 +00:00			`word_count_by_act[act_heading] += word_count_by_chapter[chapter_heading]`
			`total_word_count += word_count_by_chapter[chapter_heading]`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`chapter_heading = line[len(CHAPTER_MARKER) :].strip('()\n')`
Initial import. 2021-09-11 16:35:38 +00:00
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`# Count the words in chapter heading, because the chapter number and title count as words.`
Account for a few more edge cases. 2022-01-10 17:59:18 +00:00			`if chapter_heading:`
			`word_count_by_chapter[chapter_heading] = count_words(chapter_heading)`
			`current_status = None`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`# Modified to allow multiple statuses in a single chapter, can swap back and forth.`
			`elif line.startswith(STATUS_MARKER):`
			`if current_status is None:`
			`current_status = line[len(STATUS_MARKER) :].strip('()\n')`
Account for a few more edge cases. 2022-01-10 17:59:18 +00:00			`if chapter_heading:`
Fix formatting. 2022-01-10 19:49:05 +00:00			`status_by_chapter[chapter_heading][current_status] = count_words(`
			`chapter_heading`
			`)`
Added multi-status chapter support 2021-10-22 09:24:36 +00:00			`else:`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`current_status = line[len(STATUS_MARKER) :].strip('()\n')`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00			`elif line.startswith(ACT_MARKER):`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`act_heading = line[len(ACT_MARKER) :].strip('()\n')`
Allow act/chapter titles changed chapter_number and act_number to chapter_heading and act_heading 2021-10-22 08:41:03 +00:00			`word_count_by_act[act_heading] = count_words(act_heading)`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`elif line.startswith(COMMENT_MARKER): # Don't count the words in a comment.`
added support for markdown comments 2021-10-22 08:36:58 +00:00			`pass`
Initial import. 2021-09-11 16:35:38 +00:00			`else:`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00			`line_word_count = count_words(line)`
Allow act/chapter titles changed chapter_number and act_number to chapter_heading and act_heading 2021-10-22 08:41:03 +00:00			`word_count_by_chapter[chapter_heading] += line_word_count`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00
Added multi-status chapter support 2021-10-22 09:24:36 +00:00			`if current_status:`
			`word_count_by_status[current_status] += line_word_count`
			`status_by_chapter[chapter_heading][current_status] += line_word_count`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00
moved mdfile.close() 2021-10-22 13:37:18 +00:00			`mdfile.close()`

Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00			`# Do some final accounting after the last chapter.`
Allow act/chapter titles changed chapter_number and act_number to chapter_heading and act_heading 2021-10-22 08:41:03 +00:00			`word_count_by_act[act_heading] += word_count_by_chapter[chapter_heading]`
			`total_word_count += word_count_by_chapter[chapter_heading]`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`# -c or --chapter to give a chapter-by-chapter word count summary.`
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`if arguments.chapter:`
Made chapter/act summaries optional 2021-10-22 09:54:19 +00:00			`for chapter_heading, chapter_word_count in word_count_by_chapter.items():`
			`if chapter_heading is None:`
			`continue`

			`if len(status_by_chapter[chapter_heading]) > 1:`
			`print(f'chapter {chapter_heading}:')`

			`for chapter_status, status_count in status_by_chapter[chapter_heading].items():`
			`print(f'\t {status_count:,} ({chapter_status})')`
			`print(f'\t {chapter_word_count:,} words (total)')`
			`elif len(status_by_chapter[chapter_heading]) == 1:`
			`chapter_status = list(status_by_chapter[chapter_heading].keys())[0]`
			`print(f'chapter {chapter_heading}: {chapter_word_count:,} ({chapter_status})')`
			`else:`
			`print(f'chapter {chapter_heading}: {chapter_word_count:,}')`
Added multi-status chapter support 2021-10-22 09:24:36 +00:00
Made chapter/act summaries optional 2021-10-22 09:54:19 +00:00			`print()`
Initial import. 2021-09-11 16:35:38 +00:00
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`# -a or --act to give an act-by-act word count summary.`
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`if arguments.act:`
Made chapter/act summaries optional 2021-10-22 09:54:19 +00:00			`for act_heading, act_word_count in word_count_by_act.items():`
			`if act_heading is None:`
			`continue`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`print(`
Switch argument parsing to argparse 2021-10-23 11:49:23 +00:00			`f'act {act_heading}: {act_word_count:,} words (~{act_word_count * 100// total_word_count}%)'`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`)`
Fixing some bugs by printing after counting rather than during. 2021-09-11 23:26:56 +00:00
Made chapter/act summaries optional 2021-10-22 09:54:19 +00:00			`print()`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00
			`for status, status_word_count in word_count_by_status.items():`
Tests! And code formatting. 2021-10-22 20:44:56 +00:00			`print(`
			`f'{status}: {status_word_count:,} words (~{status_word_count * 100 // total_word_count}%)'`
			`)`
Pull chapter/act metadata from Markdown "comments in the source file. 2021-09-11 22:39:20 +00:00
Commented my changes + swapped to tempfile library 2021-10-22 12:28:13 +00:00			`print(f'total: {total_word_count:,} words')`
Initial import. 2021-09-11 16:35:38 +00:00

			`if __name__ == '__main__':`
Fix arguments error. 2021-09-11 16:57:48 +00:00			`main()`