Support Chinese characters in PDF testing
Here's the example test:
https://github.com/seleniumbase/SeleniumBase/blob/master/examples/test_chinese_pdf.py
# -*- coding: utf-8 -*-
from seleniumbase import BaseCase
class ChinesePdfTestClass(BaseCase):
    """Example test that checks Chinese text extracted from a PDF file."""

    def test_chinese_pdf(self):
        """Print the text of page 2, then assert the expected phrases."""
        pdf = ('https://github.com/seleniumbase/SeleniumBase/'
               'files/3895614/unittest.pdf')

        # Get and print PDF text
        pdf_text = self.get_pdf_text(pdf, page=2)
        print("\n" + pdf_text)

        # Assert PDF contains the expected text on Page 2
        self.assert_pdf_text(pdf, "个测试类", page=2)

        # Assert PDF contains the expected text on any of the pages
        self.assert_pdf_text(pdf, "运行单元测试")
        self.assert_pdf_text(pdf, "等待测试结束后显示所有结果")
        self.assert_pdf_text(pdf, "测试的执行跟方法的顺序没有关系")
- Updated methods:
def get_pdf_text(self, pdf, page=None, maxpages=None,
                 password=None, codec='utf-8', wrap=False, nav=False,
                 override=False):
    """ Gets text from a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        page - The page number (or a list of page numbers) of the PDF.
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, returns all PDF text.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
        password - If the PDF is password-protected, enter it here.
        codec - The name of the character encoding to use for the text.
                (This is an encoding name, not a compression format.)
                (The default codec used by this method is 'utf-8'.)
        wrap - If enabled, a space followed by a line break is replaced
               with a single space so that individual sentences from a
               PDF don't get broken up into separate lines when getting
               converted into text format. (Disabled by default here.)
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If the PDF file to be downloaded already exists in the
                   downloaded_files/ folder, that PDF will be used
                   instead of downloading it again.
                   (The default value is False. NOTE(review): this text
                   does not say which value of the flag enables the
                   reuse - confirm against the implementation.) """
def assert_pdf_text(self, pdf, text, page=None, maxpages=None,
                    password=None, codec='utf-8', wrap=True, nav=False,
                    override=False):
    """ Asserts text in a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The expected text to verify in the PDF.
        page - The page number of the PDF to use (optional).
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, looks at all the pages.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
        password - If the PDF is password-protected, enter it here.
        codec - The name of the character encoding to use for the text.
                (This is an encoding name, not a compression format.)
                (The default codec used by this method is 'utf-8'.)
        wrap - If enabled, a space followed by a line break is replaced
               with a single space so that individual sentences from a
               PDF don't get broken up into separate lines when getting
               converted into text format. (Enabled by default here.)
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If the PDF file to be downloaded already exists in the
                   downloaded_files/ folder, that PDF will be used
                   instead of downloading it again.
                   (The default value is False. NOTE(review): this text
                   does not say which value of the flag enables the
                   reuse - confirm against the implementation.) """