Pdf 文档适配 Kindle - 分行合并

Internet 上主流的 pdf 文档基本都是 A4 版面，在 kindle paperwhite 上直接阅读的体验一言难尽。如何尽可能简单地将A4幅面的文档转换为合适的格式?

2023 更新：由 kpw 更换到了掌阅的 smart X 后，8寸屏幕的 pdf 阅读体验相对可以接受了。

首先明确的是，这里指的pdf是以可复制的文字格式为主要组成部分的文档，而非扫描文档。

面向kindle的文档管理，首选的是Calibre，它已经可以将pdf文件转换为epub文档，但是转换过程中会把一整段分为很多独立的行，发送到kindle后分段错乱更明显。

参考epub电子书–目录结构介绍，Epub的格式非常容易解析，它实质上是一个包含所需标记文本、图片、元信息等的一个zip压缩包。通过修改其中的标记文本，比较容易将被分开的多行合并。

对超文本的解析走了一点弯路，最开始是打算暴力匹配的，通过正则匹配标签，然后将连续的多段合并，这样处理的问题主要是，章节标题不容易处理。

接下来简单介绍一下思路，主要用到两个包，zipfile和BeautifulSoup，后者版本是4。

main函数分三部分，解压文件，处理超文本，打包回epub格式。

def main():
    # extract epub file into appointed dir
    extract_epub("data/src.epub", "data/src")

    # format all the .html file.
    file_list = [x for x in os.listdir("data/src") if ".html" in x]
    for file in file_list:
        print(f"Handle file: {file}")
        res = handle_document("data/src/" + file)
        with open("data/src/" + file, 'w') as f:
            f.writelines(res)

    # packaging target dir into epub file.
    creat_epub("data/src", "data/dst.epub", "data/src/", log=False)
    print("Create target File.")

解压文件部分的代码：

def extract_epub(file, target_dir):
    # extract epub file into appointed dir
    with zipfile.ZipFile(file, 'r') as myzip:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        myzip.extractall(target_dir)
        print(f"Excart file into [{target_dir}]")

处理超文本部分的代码：

def handle_document(file):
    # load file, get root and body
    (soup, lines) = load_document(file, "body")

    # define ad string
    ad_string = "更多电子书资料请搜索「书行天下」：http://www.sxpdf.com"
    delete_ad(lines, ad_string)

    # merge splited lines
    handle_lines(lines)

    for line in lines:
        if not len(line.contents):
            line.decompose()

    return soup.prettify()

处理文本部分的代码完成三件事：

删除单行广告（这里的文件广告行是独立一行，所以比较容易处理）
将被分开的多行合并，准确的讲，是将这几行的所有tag按顺序合并到一个<p>标签中。
删除多余的空白<p>标签。

其中第二部分占据了主要的工作量，代码如下：

这里的lines就是所有的<p>标签，遍历后合并，被合并的标签中置空，留待后续处理。

def handle_lines(lines):
    # re-format lines, merget splited lines into one line.
    index = 0
    toc = [x[0] for x in get_toc()]

    paragraph = [0, []]

    # clear cache lines
    def clear_cache():

            if paragraph[0]:
                lines[paragraph[0]].string = ""
                for tag in paragraph[1]:
                    lines[paragraph[0]].append(tag)
                paragraph[0] = 0
            paragraph[1] = []

    while (index < len(lines)):
        line = ''.join(list(lines[index].stripped_strings))

        # merge splited title
        (is_title, completed, title) = is_toc_prefix(toc, line)
        if is_title and not completed:
            t = [x for x in lines[index].find_all(name="a") if x.string]
            if t:
                t[0].string = title           
            lines[index + 1].decompose()
            index += 2
            clear_cache()
            continue

        elif is_title and completed:
            index += 1
            clear_cache()
            continue
            

        # single line contain img, omit:
        if lines[index].img:
            index += 1
            clear_cache()
            continue

        # merge splited normal lines
        if (25 < len(line)):
            
            if not paragraph[0]:
                paragraph[0] = index

            # whether single space between every char?
            if line.count(' ')/len(line) > 0.4 and "  " not in line:
                if not lines[index].string:
                    # may have tag <a> in lines[index]
                    # handle this situation
                    t = [x for x in lines[index].contents if x.string]
                    t[0].string = line.replace(' ', '')
                else:
                    lines[index].string = line.replace(' ', '')

            paragraph[1].extend(lines[index].contents)
            if not paragraph[0] and index != paragraph[0]:
                lines[index].decompose()
            
        else:
            paragraph[1].extend(lines[index].contents)
            clear_cache()
        
        # Important!
        index += 1

    # handle last paragraph
    if paragraph[0]:
        clear_cache()

综上，整体的代码为：

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Author: Shadow Cheng

"""

import os
import zipfile
from bs4 import BeautifulSoup


def load_document(file, label_name):
    # load html or xml file, return DOM root and all node named [label_name]
    with open(file, 'r') as f:
        soup = BeautifulSoup(f, "html.parser")
        body = soup.findChildren(name=label_name)[0]

        lines = [x for x in body.children if x != '\n']

        return (soup, lines)


def get_toc():
    # Fixed, if re-format other epub file,
    # check whether the toc file named "toc.ncx"
    #
    with open("data/src/toc.ncx", 'r') as f:
        soup = BeautifulSoup(f, "html.parser")
        ncx = soup.findChildren(name='navpoint')

        toc = []
        for item in ncx:
            text = ""
            src = ""
            for s in item.children:
                if s.name == "navlabel":
                    text = s.text.strip()
                elif s.name == "content":
                    src = s.attrs['src']
            toc.append([text, src])

    return toc


def is_toc_prefix(toc, line):
    # whether line is title of content, return (flag-1, flag-2)
    # flag-1 indicates whether line in title
    # flag-2 indicates whether line is completed title.

    if len(line) == 0:
        return (False, False, "")
    for title in toc:
        if (line.replace(' ', '') in title.replace(' ', '')):
            if len(line) == len(title):
                return (True, True, title)
            else:
                return (True, False, title)
    return (False, False, title)


def delete_ad(lines, ad_string):
    for line in lines:
        if line.string == ad_string:
            line.decompose()


def handle_lines(lines):
    # re-format lines, merget splited lines into one line.
    index = 0
    toc = [x[0] for x in get_toc()]

    paragraph = [0, []]

    # clear cache lines
    def clear_cache():

            if paragraph[0]:
                lines[paragraph[0]].string = ""
                for tag in paragraph[1]:
                    lines[paragraph[0]].append(tag)
                paragraph[0] = 0
            paragraph[1] = []

    while (index < len(lines)):
        line = ''.join(list(lines[index].stripped_strings))

        # merge splited title
        (is_title, completed, title) = is_toc_prefix(toc, line)
        if is_title and not completed:
            t = [x for x in lines[index].find_all(name="a") if x.string]
            if t:
                t[0].string = title           
            lines[index + 1].decompose()
            index += 2
            clear_cache()
            continue

        elif is_title and completed:
            index += 1
            clear_cache()
            continue
            

        # single line contain img, omit:
        if lines[index].img:
            index += 1
            clear_cache()
            continue

        # merge splited normal lines
        if (25 < len(line)):
            
            if not paragraph[0]:
                paragraph[0] = index

            # whether single space between every char?
            if line.count(' ')/len(line) > 0.4 and "  " not in line:
                if not lines[index].string:
                    # may have tag <a> in lines[index]
                    # handle this situation
                    t = [x for x in lines[index].contents if x.string]
                    t[0].string = line.replace(' ', '')
                else:
                    lines[index].string = line.replace(' ', '')

            paragraph[1].extend(lines[index].contents)
            if not paragraph[0] and index != paragraph[0]:
                lines[index].decompose()
            
        else:
            paragraph[1].extend(lines[index].contents)
            clear_cache()
        
        # Important!
        index += 1

    # handle last paragraph
    if paragraph[0]:
        clear_cache()


def handle_document(file):
    # load file, get root and body
    (soup, lines) = load_document(file, "body")

    # define ad string
    ad_string = "更多电子书资料请搜索「书行天下」：http://www.sxpdf.com"
    delete_ad(lines, ad_string)

    # merge splited lines
    handle_lines(lines)

    for line in lines:
        if not len(line.contents):
            line.decompose()

    return soup.prettify()


def extract_epub(file, target_dir):
    # extract epub file into appointed dir
    with zipfile.ZipFile(file, 'r') as myzip:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        myzip.extractall(target_dir)
        print(f"Excart file into [{target_dir}]")


def creat_epub(target_dir, dst_name, wipe_prefix, log=False):
    # create epub file
    # walk around target dir, and push every file into epub.
    with zipfile.ZipFile(dst_name, 'w') as target:
        for i in os.walk(target_dir, topdown=False):
            for n in i[2]:
                file_path = os.path.normpath(''.join((i[0], "/", n)))
                file_path_in_zip = file_path.replace(wipe_prefix, '')
                target.write(file_path, file_path_in_zip)
                if log:
                    print(f"Write file:{file_path}->{file_path_in_zip}")


def main():
    # extract epub file into appointed dir
    extract_epub("data/src.epub", "data/src")

    # format all the .html file.
    file_list = [x for x in os.listdir("data/src") if ".html" in x]
    for file in file_list:
        print(f"Handle file: {file}")
        res = handle_document("data/src/" + file)
        with open("data/src/" + file, 'w') as f:
            f.writelines(res)

    # packaging target dir into epub file.
    creat_epub("data/src", "data/dst.epub", "data/src/", log=False)
    print("Create target File.")


if __name__ == "__main__":
    main()

总结：

1. BS4 库对于 html 文档的处理挺有意思，值得一试；
2. Os库自带的`walk`、`makedirs`函数很方便，自己想要实现也不难，留待后续有时间的时候练手吧。