From 3618c2617006f1c6fc08f88f04b122fe34406185 Mon Sep 17 00:00:00 2001 From: Daniel Bosk Date: Tue, 17 Dec 2024 13:21:27 +0100 Subject: [PATCH] Converts PDFs to text using pdf2txt --- src/canvaslms/cli/submissions.nw | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/canvaslms/cli/submissions.nw b/src/canvaslms/cli/submissions.nw index a20be10..6e0eae6 100644 --- a/src/canvaslms/cli/submissions.nw +++ b/src/canvaslms/cli/submissions.nw @@ -905,6 +905,7 @@ def convert_to_md(attachment: canvasapi.file.File, <> content_type = getattr(attachment, "content-type") <> + <> <> <>= contents = convert_to_md(attachment, tmpdir) @@ -951,6 +952,8 @@ def text_to_md(content_type): This leaves us with the following. The advantage of reading the content from the file is that Python will solve the encoding for us. +Instead of using an [[if]] statement, we'll go all Python and use a +[[try-except]] block. <>= try: md_type = text_to_md(content_type) @@ -961,7 +964,22 @@ except ValueError: pass @ -If the content type is not text, we use [[pypandoc]] to convert it to Markdown. +Now we'll do the same for PDF files. +We'll use [[pdf2txt]] to convert the PDF to text. +However, here we'll use an [[if]] statement. +We'll check that the content type ends with [[pdf]], which will also capture +[[x-pdf]]. +<>= +if content_type.endswith("pdf"): + try: + return subprocess.check_output(["pdf2txt", str(outfile)], + text=True) + except subprocess.CalledProcessError: + pass +@ + +Finally, as a last attempt, we use [[pypandoc]] to try to convert it to +Markdown. Here we'll use Pandoc's ability to infer the file type on its own. This means we'll have to download the attachment as a file in a temporary location and let Pandoc convert the file to Markdown. @@ -969,7 +987,7 @@ location and let Pandoc convert the file to Markdown. try: return pypandoc.convert_file(outfile, "markdown") except Exception as err: - return f"Pandoc cannot convert this file. 
" \ + return f"Cannot convert this file. " \ f"The file is located at\n\n {outfile}\n\n" @