From fc0551f7dcf2880dcbd97b0cbec0a93d1ec78fcd Mon Sep 17 00:00:00 2001 From: Aykhan Date: Wed, 13 Sep 2023 23:46:11 +0400 Subject: [PATCH] Added BeautifulSoup to html2text function --- src/app/utils/custom_functions.py | 9 +++------ src/poetry.lock | 31 ++++++++++++++++++++++++++++++- src/pyproject.toml | 1 + 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/app/utils/custom_functions.py b/src/app/utils/custom_functions.py index 90832a0..51188fe 100644 --- a/src/app/utils/custom_functions.py +++ b/src/app/utils/custom_functions.py @@ -1,9 +1,6 @@ -import re +from bs4 import BeautifulSoup def html2text(html: str) -> str: - return re.sub( - re.compile('<.*?>'), - '', - html - ) \ No newline at end of file + soup = BeautifulSoup(html, 'html.parser') + return soup.get_text() \ No newline at end of file diff --git a/src/poetry.lock b/src/poetry.lock index 32a499d..524b1ce 100644 --- a/src/poetry.lock +++ b/src/poetry.lock @@ -130,6 +130,24 @@ files = [ docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["flake8 (>=5.0,<6.0)", "uvloop (>=0.15.3)"] +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "blinker" version = "1.6.2" @@ -975,6 +993,17 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "sqlalchemy" version = "2.0.20" @@ -1114,4 +1143,4 @@ standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "333a9be3573863da41b6a2615cd28aa3a6daf2e608a76da1476af72d343dae52" +content-hash = "78fafe7f9e40a699277d899f9b66a4bc2b9d191008a5fcd6bd393462d51d473f" diff --git a/src/pyproject.toml b/src/pyproject.toml index cbc95aa..27e0401 100644 --- a/src/pyproject.toml +++ b/src/pyproject.toml @@ -23,6 +23,7 @@ pillow = "^10.0.0" aiofiles = "^23.2.1" python-jose = {extras = ["cryptography"], version = "^3.3.0"} fastapi-mail = "^1.4.1" +beautifulsoup4 = "^4.12.2" [build-system]