mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-22 18:32:15 +00:00
170 lines
5.0 KiB
Python
170 lines
5.0 KiB
Python
"""
|
|
NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
|
|
"""
|
|
|
|
import warnings
|
|
|
|
from bleach import ALLOWED_ATTRIBUTES, ALLOWED_TAGS, clean
|
|
from traitlets import Any, Bool, List, Set, Unicode
|
|
|
|
# Feature flags describing which CSS-sanitizing API the installed bleach
# exposes.  Both stay False when no CSS sanitization is available at all.
_USE_BLEACH_CSS_SANITIZER = False
_USE_BLEACH_STYLES = False

try:
    # bleach[css] >=5.0
    from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES as ALLOWED_STYLES
    from bleach.css_sanitizer import CSSSanitizer

    _USE_BLEACH_CSS_SANITIZER = True
    _USE_BLEACH_STYLES = False
except ImportError:
    try:
        # bleach <5
        from bleach import ALLOWED_STYLES

        _USE_BLEACH_CSS_SANITIZER = False
        _USE_BLEACH_STYLES = True
        warnings.warn(
            "Support for bleach <5 will be removed in a future version of nbconvert",
            DeprecationWarning,
        )

    except ImportError:
        # Neither API is importable.  ALLOWED_STYLES is still used below as
        # the default of the `styles` trait, so bind it to an empty list;
        # otherwise the module dies with NameError right after this warning.
        ALLOWED_STYLES = []
        warnings.warn(
            "The installed bleach/tinycss2 do not provide CSS sanitization, "
            "please upgrade to bleach >=5",
            UserWarning,
        )
|
|
|
|
|
|
from .base import Preprocessor
|
|
|
|
__all__ = ["SanitizeHTML"]
|
|
|
|
|
|
class SanitizeHTML(Preprocessor):
    """Preprocessor that sanitizes HTML found in notebook cells with bleach.

    Raw and markdown cell sources are passed through ``bleach.clean``.  For
    code cells, each output's ``data`` entries are kept as-is, sanitized, or
    removed depending on ``safe_output_keys`` and ``sanitized_output_types``.
    """

    # Bleach config.
    attributes = Any(
        config=True,
        default_value=ALLOWED_ATTRIBUTES,
        help="Allowed HTML tag attributes",
    )
    tags = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_TAGS,
        help="List of HTML tags to allow",
    )
    styles = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_STYLES,
        help="Allowed CSS styles if <style> tag is allowed",
    )
    strip = Bool(
        config=True,
        default_value=False,
        help="If True, remove unsafe markup entirely instead of escaping",
    )
    strip_comments = Bool(
        config=True,
        default_value=True,
        help="If True, strip comments from escaped HTML",
    )

    # Display data config.
    safe_output_keys = Set(
        config=True,
        default_value={
            "metadata",  # Not a mimetype per-se, but expected and safe.
            "text/plain",
            "text/latex",
            "application/json",
            "image/png",
            "image/jpeg",
        },
        help="Cell output mimetypes to render without modification",
    )
    sanitized_output_types = Set(
        config=True,
        default_value={
            "text/html",
            "text/markdown",
        },
        help="Cell output types to display after escaping with Bleach.",
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Sanitize potentially-dangerous contents of the cell.

        Cell Types:
          raw:
            Sanitize literal HTML
          markdown:
            Sanitize literal HTML
          code:
            Sanitize outputs that could result in code execution

        Cells of any other type are passed through unchanged.

        Parameters
        ----------
        cell
            The notebook cell; it is modified in place.
        resources
            Returned untouched alongside the cell.
        cell_index
            Unused here.

        Returns
        -------
        tuple
            ``(cell, resources)`` for every cell type, so callers can always
            unpack the result.
        """
        if cell.cell_type in ("raw", "markdown"):
            # Sanitize all raw cells anyway.
            # Only ones with the text/html mimetype should be emitted
            # but erring on the side of safety maybe.
            cell.source = self.sanitize_html_tags(cell.source)
        elif cell.cell_type == "code":
            cell.outputs = self.sanitize_code_outputs(cell.outputs)
        # A single exit point: previously an unrecognised cell type fell off
        # the end of the method and returned None.
        return cell, resources

    def sanitize_code_outputs(self, outputs):
        """
        Sanitize code cell outputs.

        For every output that is not a ``stream`` or ``error``, each key of
        ``output.data`` is handled as follows:

        * keys in ``safe_output_keys`` are left unchanged;
        * keys in ``sanitized_output_types`` are run through
          `sanitize_html_tags`;
        * every other key (e.g. 'text/javascript') is deleted.

        The outputs are modified in place and the same object is returned.
        """
        for output in outputs:
            # These are always ascii, so nothing to escape.
            if output["output_type"] in ("stream", "error"):
                continue

            data = output.data
            to_remove = []
            for key in data:
                if key in self.safe_output_keys:
                    continue
                if key in self.sanitized_output_types:
                    self.log.info("Sanitizing %s", key)
                    data[key] = self.sanitize_html_tags(data[key])
                else:
                    # Mark key for removal. (Python doesn't allow deletion of
                    # keys from a dict during iteration)
                    to_remove.append(key)

            for key in to_remove:
                self.log.info("Removing %s", key)
                del data[key]

        return outputs

    def sanitize_html_tags(self, html_str):
        """
        Sanitize a string containing raw HTML tags.

        Calls ``bleach.clean`` with this instance's ``tags``, ``attributes``,
        ``strip`` and ``strip_comments`` settings.  ``styles`` is forwarded
        through whichever CSS API the module-level feature flags selected at
        import time; if neither is available, no CSS option is passed.
        """
        kwargs = {
            "tags": self.tags,
            "attributes": self.attributes,
            "strip": self.strip,
            "strip_comments": self.strip_comments,
        }

        if _USE_BLEACH_CSS_SANITIZER:
            # bleach >=5: allowed CSS properties are wrapped in a CSSSanitizer.
            kwargs["css_sanitizer"] = CSSSanitizer(allowed_css_properties=self.styles)
        elif _USE_BLEACH_STYLES:
            # bleach <5: allowed styles are passed to clean() directly.
            kwargs["styles"] = self.styles

        return clean(html_str, **kwargs)
|