Source code for pdf2image.pdf2image

"""
    pdf2image is a light wrapper for the poppler-utils tools that can convert your
    PDFs into Pillow images.
"""

import os
import platform
import tempfile
import types
import shutil
import subprocess
from subprocess import Popen, PIPE, TimeoutExpired
from typing import Any, Union, Tuple, List, Dict, Callable
from pathlib import PurePath
from PIL import Image

from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator

from pdf2image.parsers import (
    parse_buffer_to_pgm,
    parse_buffer_to_ppm,
    parse_buffer_to_jpeg,
    parse_buffer_to_png,
)

from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError,
    PDFPopplerTimeoutError,
)

TRANSPARENT_FILE_TYPES = ["png", "tiff"]
PDFINFO_CONVERT_TO_INT = ["Pages"]


[docs]def convert_from_path(
    pdf_path: Union[str, PurePath],
    dpi: int = 200,
    output_folder: Union[str, PurePath] = None,
    first_page: int = None,
    last_page: int = None,
    fmt: str = "ppm",
    jpegopt: Dict = None,
    thread_count: int = 1,
    userpw: str = None,
    ownerpw: str = None,
    use_cropbox: bool = False,
    strict: bool = False,
    transparent: bool = False,
    single_file: bool = False,
    output_file: Any = uuid_generator(),
    poppler_path: Union[str, PurePath] = None,
    grayscale: bool = False,
    size: Union[Tuple, int] = None,
    paths_only: bool = False,
    use_pdftocairo: bool = False,
    timeout: int = None,
    hide_annotations: bool = False,
) -> List[Image.Image]:
    """Function wrapping pdftoppm and pdftocairo

    :param pdf_path: Path to the PDF that you want to convert
    :type pdf_path: Union[str, PurePath]
    :param dpi: Image quality in DPI (default 200), defaults to 200
    :type dpi: int, optional
    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
    :type output_folder: Union[str, PurePath], optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :param fmt: Output image format, defaults to "ppm"
    :type fmt: str, optional
    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
    :type jpegopt: Dict, optional
    :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
    :type thread_count: int, optional
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
    :type use_cropbox: bool, optional
    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
    :type strict: bool, optional
    :param transparent: Output with a transparent background instead of a white one, defaults to False
    :type transparent: bool, optional
    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
    :type single_file: bool, optional
    :param output_file: What is the output filename or generator, defaults to uuid_generator()
    :type output_file: Any, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param grayscale: Output grayscale image(s), defaults to False
    :type grayscale: bool, optional
    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
    :type size: Union[Tuple, int], optional
    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
    :type paths_only: bool, optional
    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
    :type use_pdftocairo: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param hide_annotations: Hide PDF annotations in the output, defaults to False
    :type hide_annotations: bool, optional
    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
    :return: A list of Pillow images, one for each page between first_page and last_page
    :rtype: List[Image.Image]
    """

    if use_pdftocairo and fmt == "ppm":
        fmt = "png"

    # We make sure that if passed arguments are Path objects, they're converted to strings
    if isinstance(pdf_path, PurePath):
        pdf_path = pdf_path.as_posix()

    if isinstance(output_folder, PurePath):
        output_folder = output_folder.as_posix()

    if isinstance(poppler_path, PurePath):
        poppler_path = poppler_path.as_posix()

    page_count = pdfinfo_from_path(
        pdf_path, userpw, ownerpw, poppler_path=poppler_path
    )["Pages"]

    # We start by getting the output format, the buffer processing function and if we need pdftocairo
    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
        fmt, grayscale
    )

    # We use pdftocairo is the format requires it OR we need a transparent output
    use_pdfcairo = (
        use_pdftocairo
        or use_pdfcairo_format
        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
    )

    poppler_version_major, poppler_version_minor = _get_poppler_version(
        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
    )

    if poppler_version_major == 0 and poppler_version_minor <= 57:
        jpegopt = None

    if poppler_version_major == 0 and poppler_version_minor <= 83:
        hide_annotations = False

    # If output_file isn't a generator, it will be turned into one
    if not isinstance(output_file, types.GeneratorType) and not isinstance(
        output_file, ThreadSafeGenerator
    ):
        if single_file:
            output_file = iter([output_file])
        else:
            output_file = counter_generator(output_file)

    if thread_count < 1:
        thread_count = 1

    if first_page is None or first_page < 1:
        first_page = 1

    if last_page is None or last_page > page_count:
        last_page = page_count

    if first_page > last_page:
        return []

    try:
        auto_temp_dir = False
        if output_folder is None and use_pdfcairo:
            output_folder = tempfile.mkdtemp()
            auto_temp_dir = True

        # Recalculate page count based on first and last page
        page_count = last_page - first_page + 1

        if thread_count > page_count:
            thread_count = page_count

        reminder = page_count % thread_count
        current_page = first_page
        processes = []
        for _ in range(thread_count):
            thread_output_file = next(output_file)

            # Get the number of pages the thread will be processing
            thread_page_count = page_count // thread_count + int(reminder > 0)
            # Build the command accordingly
            args = _build_command(
                ["-r", str(dpi), pdf_path],
                output_folder,
                current_page,
                current_page + thread_page_count - 1,
                parsed_fmt,
                jpegopt,
                thread_output_file,
                userpw,
                ownerpw,
                use_cropbox,
                transparent,
                single_file,
                grayscale,
                size,
                hide_annotations,
            )

            if use_pdfcairo:
                if hide_annotations:
                    raise NotImplementedError(
                        "Hide annotations flag not implemented in pdftocairo."
                    )
                args = [_get_command_path("pdftocairo", poppler_path)] + args
            else:
                args = [_get_command_path("pdftoppm", poppler_path)] + args

            # Update page values
            current_page = current_page + thread_page_count
            reminder -= int(reminder > 0)
            # Add poppler path to LD_LIBRARY_PATH
            env = os.environ.copy()
            if poppler_path is not None:
                env["LD_LIBRARY_PATH"] = (
                    poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
                )
            # Spawn the process and save its uuid
            startupinfo = None
            if platform.system() == "Windows":
                # this startupinfo structure prevents a console window from popping up on Windows
                startupinfo = subprocess.STARTUPINFO()
                startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            processes.append(
                (
                    thread_output_file,
                    Popen(
                        args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo
                    ),
                )
            )

        images = []

        for uid, proc in processes:
            try:
                data, err = proc.communicate(timeout=timeout)
            except TimeoutExpired:
                proc.kill()
                outs, errs = proc.communicate()
                raise PDFPopplerTimeoutError("Run poppler timeout.")

            if b"Syntax Error" in err and strict:
                raise PDFSyntaxError(err.decode("utf8", "ignore"))

            if output_folder is not None:
                images += _load_from_output_folder(
                    output_folder,
                    uid,
                    final_extension,
                    paths_only,
                    in_memory=auto_temp_dir,
                )
            else:
                images += parse_buffer_func(data)
    finally:
        if auto_temp_dir:
            shutil.rmtree(output_folder)

    return images


[docs]def convert_from_bytes(
    pdf_file: bytes,
    dpi: int = 200,
    output_folder: Union[str, PurePath] = None,
    first_page: int = None,
    last_page: int = None,
    fmt: str = "ppm",
    jpegopt: Dict = None,
    thread_count: int = 1,
    userpw: str = None,
    ownerpw: str = None,
    use_cropbox: bool = False,
    strict: bool = False,
    transparent: bool = False,
    single_file: bool = False,
    output_file: Union[str, PurePath] = uuid_generator(),
    poppler_path: Union[str, PurePath] = None,
    grayscale: bool = False,
    size: Union[Tuple, int] = None,
    paths_only: bool = False,
    use_pdftocairo: bool = False,
    timeout: int = None,
    hide_annotations: bool = False,
) -> List[Image.Image]:
    """Function wrapping pdftoppm and pdftocairo.

    :param pdf_bytes: Bytes of the PDF that you want to convert
    :type pdf_bytes: bytes
    :param dpi: Image quality in DPI (default 200), defaults to 200
    :type dpi: int, optional
    :param output_folder: Write the resulting images to a folder (instead of directly in memory), defaults to None
    :type output_folder: Union[str, PurePath], optional
    :param first_page: First page to process, defaults to None
    :type first_page: int, optional
    :param last_page: Last page to process before stopping, defaults to None
    :type last_page: int, optional
    :param fmt: Output image format, defaults to "ppm"
    :type fmt: str, optional
    :param jpegopt: jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format), defaults to None
    :type jpegopt: Dict, optional
    :param thread_count: How many threads we are allowed to spawn for processing, defaults to 1
    :type thread_count: int, optional
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param use_cropbox: Use cropbox instead of mediabox, defaults to False
    :type use_cropbox: bool, optional
    :param strict: When a Syntax Error is thrown, it will be raised as an Exception, defaults to False
    :type strict: bool, optional
    :param transparent: Output with a transparent background instead of a white one, defaults to False
    :type transparent: bool, optional
    :param single_file: Uses the -singlefile option from pdftoppm/pdftocairo, defaults to False
    :type single_file: bool, optional
    :param output_file: What is the output filename or generator, defaults to uuid_generator()
    :type output_file: Any, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param grayscale: Output grayscale image(s), defaults to False
    :type grayscale: bool, optional
    :param size: Size of the resulting image(s), uses the Pillow (width, height) standard, defaults to None
    :type size: Union[Tuple, int], optional
    :param paths_only: Don't load image(s), return paths instead (requires output_folder), defaults to False
    :type paths_only: bool, optional
    :param use_pdftocairo: Use pdftocairo instead of pdftoppm, may help performance, defaults to False
    :type use_pdftocairo: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :param hide_annotations: Hide PDF annotations in the output, defaults to False
    :type hide_annotations: bool, optional
    :raises NotImplementedError: Raised when conflicting parameters are given (hide_annotations for pdftocairo)
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFSyntaxError: Raised if there is a syntax error in the PDF and strict=True
    :return: A list of Pillow images, one for each page between first_page and last_page
    :rtype: List[Image.Image]
    """

    fh, temp_filename = tempfile.mkstemp()
    try:
        with open(temp_filename, "wb") as f:
            f.write(pdf_file)
            f.flush()
            return convert_from_path(
                f.name,
                dpi=dpi,
                output_folder=output_folder,
                first_page=first_page,
                last_page=last_page,
                fmt=fmt,
                jpegopt=jpegopt,
                thread_count=thread_count,
                userpw=userpw,
                ownerpw=ownerpw,
                use_cropbox=use_cropbox,
                strict=strict,
                transparent=transparent,
                single_file=single_file,
                output_file=output_file,
                poppler_path=poppler_path,
                grayscale=grayscale,
                size=size,
                paths_only=paths_only,
                use_pdftocairo=use_pdftocairo,
                timeout=timeout,
                hide_annotations=hide_annotations,
            )
    finally:
        os.close(fh)
        os.remove(temp_filename)


def _build_command(
    args: List,
    output_folder: str,
    first_page: int,
    last_page: int,
    fmt: str,
    jpegopt: Dict,
    output_file: str,
    userpw: str,
    ownerpw: str,
    use_cropbox: bool,
    transparent: bool,
    single_file: bool,
    grayscale: bool,
    size: Union[int, Tuple[int, int]],
    hide_annotations: bool,
) -> List[str]:
    if use_cropbox:
        args.append("-cropbox")

    if hide_annotations:
        args.append("-hide-annotations")

    if transparent and fmt in TRANSPARENT_FILE_TYPES:
        args.append("-transp")

    if first_page is not None:
        args.extend(["-f", str(first_page)])

    if last_page is not None:
        args.extend(["-l", str(last_page)])

    if fmt not in ["pgm", "ppm"]:
        args.append("-" + fmt)

    if fmt in ["jpeg", "jpg"] and jpegopt:
        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])

    if single_file:
        args.append("-singlefile")

    if output_folder is not None:
        args.append(os.path.join(output_folder, output_file))

    if userpw is not None:
        args.extend(["-upw", userpw])

    if ownerpw is not None:
        args.extend(["-opw", ownerpw])

    if grayscale:
        args.append("-gray")

    if size is None:
        pass
    elif isinstance(size, tuple) and len(size) == 2:
        if size[0] is not None:
            args.extend(["-scale-to-x", str(int(size[0]))])
        else:
            args.extend(["-scale-to-x", str(-1)])
        if size[1] is not None:
            args.extend(["-scale-to-y", str(int(size[1]))])
        else:
            args.extend(["-scale-to-y", str(-1)])
    elif isinstance(size, tuple) and len(size) == 1:
        args.extend(["-scale-to", str(int(size[0]))])
    elif isinstance(size, int) or isinstance(size, float):
        args.extend(["-scale-to", str(int(size))])
    else:
        raise ValueError(f"Size {size} is not a tuple or an integer")

    return args


def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable, bool]:
    fmt = fmt.lower()
    if fmt[0] == ".":
        fmt = fmt[1:]
    if fmt in ("jpeg", "jpg"):
        return "jpeg", "jpg", parse_buffer_to_jpeg, False
    if fmt == "png":
        return "png", "png", parse_buffer_to_png, False
    if fmt in ("tif", "tiff"):
        return "tiff", "tif", None, True
    if fmt == "ppm" and grayscale:
        return "pgm", "pgm", parse_buffer_to_pgm, False
    # Unable to parse the format so we'll use the default
    return "ppm", "ppm", parse_buffer_to_ppm, False


def _parse_jpegopt(jpegopt: Dict) -> str:
    parts = []
    for k, v in jpegopt.items():
        if v is True:
            v = "y"
        if v is False:
            v = "n"
        parts.append("{}={}".format(k, v))
    return ",".join(parts)


def _get_command_path(command: str, poppler_path: str = None) -> str:
    if platform.system() == "Windows":
        command = command + ".exe"

    if poppler_path is not None:
        command = os.path.join(poppler_path, command)

    return command


def _get_poppler_version(
    command: str, poppler_path: str = None, timeout: int = None
) -> Tuple[int, int]:
    command = [_get_command_path(command, poppler_path), "-v"]

    env = os.environ.copy()
    if poppler_path is not None:
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
    proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)

    try:
        data, err = proc.communicate(timeout=timeout)
    except TimeoutExpired:
        proc.kill()
        outs, errs = proc.communicate()
        raise PDFPopplerTimeoutError("Run poppler poppler timeout.")

    try:
        # TODO: Make this more robust
        version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
        return int(version[0]), int(version[1])
    except:
        # Lowest version that includes pdftocairo (2011)
        return 0, 17


[docs]def pdfinfo_from_path(
    pdf_path: str,
    userpw: str = None,
    ownerpw: str = None,
    poppler_path: str = None,
    rawdates: bool = False,
    timeout: int = None,
) -> Dict:
    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.

    :param pdf_path: Path to the PDF that you want to convert
    :type pdf_path: str
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param rawdates: Return the undecoded data strings, defaults to False
    :type rawdates: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :raises PDFPopplerTimeoutError: Raised after the timeout for the image processing is exceeded
    :raises PDFInfoNotInstalledError: Raised if pdfinfo is not installed
    :raises PDFPageCountError: Raised if the output could not be parsed
    :return: Dictionary containing various information on the PDF
    :rtype: Dict
    """
    try:
        command = [_get_command_path("pdfinfo", poppler_path), pdf_path]

        if userpw is not None:
            command.extend(["-upw", userpw])

        if ownerpw is not None:
            command.extend(["-opw", ownerpw])

        if rawdates:
            command.extend(["-rawdates"])

        # Add poppler path to LD_LIBRARY_PATH
        env = os.environ.copy()
        if poppler_path is not None:
            env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
        proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)

        try:
            out, err = proc.communicate(timeout=timeout)
        except TimeoutExpired:
            proc.kill()
            outs, errs = proc.communicate()
            raise PDFPopplerTimeoutError("Run poppler poppler timeout.")

        d = {}
        for field in out.decode("utf8", "ignore").split("\n"):
            sf = field.split(":")
            key, value = sf[0], ":".join(sf[1:])
            if key != "":
                d[key] = (
                    int(value.strip())
                    if key in PDFINFO_CONVERT_TO_INT
                    else value.strip()
                )

        if "Pages" not in d:
            raise ValueError

        return d

    except OSError:
        raise PDFInfoNotInstalledError(
            "Unable to get page count. Is poppler installed and in PATH?"
        )
    except ValueError:
        raise PDFPageCountError(
            f"Unable to get page count.\n{err.decode('utf8', 'ignore')}"
        )


[docs]def pdfinfo_from_bytes(
    pdf_bytes: bytes,
    userpw: str = None,
    ownerpw: str = None,
    poppler_path: str = None,
    rawdates: bool = False,
    timeout: int = None,
) -> Dict:
    """Function wrapping poppler's pdfinfo utility and returns the result as a dictionary.

    :param pdf_bytes: Bytes of the PDF that you want to convert
    :type pdf_bytes: bytes
    :param userpw: PDF's password, defaults to None
    :type userpw: str, optional
    :param ownerpw: PDF's owner password, defaults to None
    :type ownerpw: str, optional
    :param poppler_path: Path to look for poppler binaries, defaults to None
    :type poppler_path: Union[str, PurePath], optional
    :param rawdates: Return the undecoded data strings, defaults to False
    :type rawdates: bool, optional
    :param timeout: Raise PDFPopplerTimeoutError after the given time, defaults to None
    :type timeout: int, optional
    :return: Dictionary containing various information on the PDF
    :rtype: Dict
    """
    fh, temp_filename = tempfile.mkstemp()
    try:
        with open(temp_filename, "wb") as f:
            f.write(pdf_bytes)
            f.flush()
        return pdfinfo_from_path(
            temp_filename,
            userpw=userpw,
            ownerpw=ownerpw,
            poppler_path=poppler_path,
            rawdates=rawdates,
            timeout=timeout,
        )
    finally:
        os.close(fh)
        os.remove(temp_filename)


def _load_from_output_folder(
    output_folder: str,
    output_file: str,
    ext: str,
    paths_only: bool,
    in_memory: bool = False,
) -> List[Image.Image]:
    images = []
    for f in sorted(os.listdir(output_folder)):
        if f.startswith(output_file) and f.split(".")[-1] == ext:
            if paths_only:
                images.append(os.path.join(output_folder, f))
            else:
                images.append(Image.open(os.path.join(output_folder, f)))
                if in_memory:
                    images[-1].load()
    return images