mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-04-22 10:28:02 +00:00
713 lines
20 KiB
Python
713 lines
20 KiB
Python
from __future__ import absolute_import
|
|
|
|
from numbers import Number
|
|
|
|
from plotly import exceptions, optional_imports
|
|
import plotly.colors as clrs
|
|
from plotly.graph_objs import graph_objs
|
|
from plotly.subplots import make_subplots
|
|
|
|
pd = optional_imports.get_module("pandas")
|
|
np = optional_imports.get_module("numpy")
|
|
scipy_stats = optional_imports.get_module("scipy.stats")
|
|
|
|
|
|
def calc_stats(data):
|
|
"""
|
|
Calculate statistics for use in violin plot.
|
|
"""
|
|
x = np.asarray(data, np.float)
|
|
vals_min = np.min(x)
|
|
vals_max = np.max(x)
|
|
q2 = np.percentile(x, 50, interpolation="linear")
|
|
q1 = np.percentile(x, 25, interpolation="lower")
|
|
q3 = np.percentile(x, 75, interpolation="higher")
|
|
iqr = q3 - q1
|
|
whisker_dist = 1.5 * iqr
|
|
|
|
# in order to prevent drawing whiskers outside the interval
|
|
# of data one defines the whisker positions as:
|
|
d1 = np.min(x[x >= (q1 - whisker_dist)])
|
|
d2 = np.max(x[x <= (q3 + whisker_dist)])
|
|
return {
|
|
"min": vals_min,
|
|
"max": vals_max,
|
|
"q1": q1,
|
|
"q2": q2,
|
|
"q3": q3,
|
|
"d1": d1,
|
|
"d2": d2,
|
|
}
|
|
|
|
|
|
def make_half_violin(x, y, fillcolor="#1f77b4", linecolor="rgb(0, 0, 0)"):
|
|
"""
|
|
Produces a sideways probability distribution fig violin plot.
|
|
"""
|
|
text = [
|
|
"(pdf(y), y)=(" + "{:0.2f}".format(x[i]) + ", " + "{:0.2f}".format(y[i]) + ")"
|
|
for i in range(len(x))
|
|
]
|
|
|
|
return graph_objs.Scatter(
|
|
x=x,
|
|
y=y,
|
|
mode="lines",
|
|
name="",
|
|
text=text,
|
|
fill="tonextx",
|
|
fillcolor=fillcolor,
|
|
line=graph_objs.scatter.Line(width=0.5, color=linecolor, shape="spline"),
|
|
hoverinfo="text",
|
|
opacity=0.5,
|
|
)
|
|
|
|
|
|
def make_violin_rugplot(vals, pdf_max, distance, color="#1f77b4"):
|
|
"""
|
|
Returns a rugplot fig for a violin plot.
|
|
"""
|
|
return graph_objs.Scatter(
|
|
y=vals,
|
|
x=[-pdf_max - distance] * len(vals),
|
|
marker=graph_objs.scatter.Marker(color=color, symbol="line-ew-open"),
|
|
mode="markers",
|
|
name="",
|
|
showlegend=False,
|
|
hoverinfo="y",
|
|
)
|
|
|
|
|
|
def make_non_outlier_interval(d1, d2):
|
|
"""
|
|
Returns the scatterplot fig of most of a violin plot.
|
|
"""
|
|
return graph_objs.Scatter(
|
|
x=[0, 0],
|
|
y=[d1, d2],
|
|
name="",
|
|
mode="lines",
|
|
line=graph_objs.scatter.Line(width=1.5, color="rgb(0,0,0)"),
|
|
)
|
|
|
|
|
|
def make_quartiles(q1, q3):
|
|
"""
|
|
Makes the upper and lower quartiles for a violin plot.
|
|
"""
|
|
return graph_objs.Scatter(
|
|
x=[0, 0],
|
|
y=[q1, q3],
|
|
text=[
|
|
"lower-quartile: " + "{:0.2f}".format(q1),
|
|
"upper-quartile: " + "{:0.2f}".format(q3),
|
|
],
|
|
mode="lines",
|
|
line=graph_objs.scatter.Line(width=4, color="rgb(0,0,0)"),
|
|
hoverinfo="text",
|
|
)
|
|
|
|
|
|
def make_median(q2):
|
|
"""
|
|
Formats the 'median' hovertext for a violin plot.
|
|
"""
|
|
return graph_objs.Scatter(
|
|
x=[0],
|
|
y=[q2],
|
|
text=["median: " + "{:0.2f}".format(q2)],
|
|
mode="markers",
|
|
marker=dict(symbol="square", color="rgb(255,255,255)"),
|
|
hoverinfo="text",
|
|
)
|
|
|
|
|
|
def make_XAxis(xaxis_title, xaxis_range):
|
|
"""
|
|
Makes the x-axis for a violin plot.
|
|
"""
|
|
xaxis = graph_objs.layout.XAxis(
|
|
title=xaxis_title,
|
|
range=xaxis_range,
|
|
showgrid=False,
|
|
zeroline=False,
|
|
showline=False,
|
|
mirror=False,
|
|
ticks="",
|
|
showticklabels=False,
|
|
)
|
|
return xaxis
|
|
|
|
|
|
def make_YAxis(yaxis_title):
|
|
"""
|
|
Makes the y-axis for a violin plot.
|
|
"""
|
|
yaxis = graph_objs.layout.YAxis(
|
|
title=yaxis_title,
|
|
showticklabels=True,
|
|
autorange=True,
|
|
ticklen=4,
|
|
showline=True,
|
|
zeroline=False,
|
|
showgrid=False,
|
|
mirror=False,
|
|
)
|
|
return yaxis
|
|
|
|
|
|
def violinplot(vals, fillcolor="#1f77b4", rugplot=True):
|
|
"""
|
|
Refer to FigureFactory.create_violin() for docstring.
|
|
"""
|
|
vals = np.asarray(vals, np.float)
|
|
# summary statistics
|
|
vals_min = calc_stats(vals)["min"]
|
|
vals_max = calc_stats(vals)["max"]
|
|
q1 = calc_stats(vals)["q1"]
|
|
q2 = calc_stats(vals)["q2"]
|
|
q3 = calc_stats(vals)["q3"]
|
|
d1 = calc_stats(vals)["d1"]
|
|
d2 = calc_stats(vals)["d2"]
|
|
|
|
# kernel density estimation of pdf
|
|
pdf = scipy_stats.gaussian_kde(vals)
|
|
# grid over the data interval
|
|
xx = np.linspace(vals_min, vals_max, 100)
|
|
# evaluate the pdf at the grid xx
|
|
yy = pdf(xx)
|
|
max_pdf = np.max(yy)
|
|
# distance from the violin plot to rugplot
|
|
distance = (2.0 * max_pdf) / 10 if rugplot else 0
|
|
# range for x values in the plot
|
|
plot_xrange = [-max_pdf - distance - 0.1, max_pdf + 0.1]
|
|
plot_data = [
|
|
make_half_violin(-yy, xx, fillcolor=fillcolor),
|
|
make_half_violin(yy, xx, fillcolor=fillcolor),
|
|
make_non_outlier_interval(d1, d2),
|
|
make_quartiles(q1, q3),
|
|
make_median(q2),
|
|
]
|
|
if rugplot:
|
|
plot_data.append(
|
|
make_violin_rugplot(vals, max_pdf, distance=distance, color=fillcolor)
|
|
)
|
|
return plot_data, plot_xrange
|
|
|
|
|
|
def violin_no_colorscale(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
):
|
|
"""
|
|
Refer to FigureFactory.create_violin() for docstring.
|
|
|
|
Returns fig for violin plot without colorscale.
|
|
|
|
"""
|
|
|
|
# collect all group names
|
|
group_name = []
|
|
for name in data[group_header]:
|
|
if name not in group_name:
|
|
group_name.append(name)
|
|
if sort:
|
|
group_name.sort()
|
|
|
|
gb = data.groupby([group_header])
|
|
L = len(group_name)
|
|
|
|
fig = make_subplots(
|
|
rows=1, cols=L, shared_yaxes=True, horizontal_spacing=0.025, print_grid=False
|
|
)
|
|
color_index = 0
|
|
for k, gr in enumerate(group_name):
|
|
vals = np.asarray(gb.get_group(gr)[data_header], np.float)
|
|
if color_index >= len(colors):
|
|
color_index = 0
|
|
plot_data, plot_xrange = violinplot(
|
|
vals, fillcolor=colors[color_index], rugplot=rugplot
|
|
)
|
|
layout = graph_objs.Layout()
|
|
|
|
for item in plot_data:
|
|
fig.append_trace(item, 1, k + 1)
|
|
color_index += 1
|
|
|
|
# add violin plot labels
|
|
fig["layout"].update(
|
|
{"xaxis{}".format(k + 1): make_XAxis(group_name[k], plot_xrange)}
|
|
)
|
|
|
|
# set the sharey axis style
|
|
fig["layout"].update({"yaxis{}".format(1): make_YAxis("")})
|
|
fig["layout"].update(
|
|
title=title,
|
|
showlegend=False,
|
|
hovermode="closest",
|
|
autosize=False,
|
|
height=height,
|
|
width=width,
|
|
)
|
|
|
|
return fig
|
|
|
|
|
|
def violin_colorscale(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
):
|
|
"""
|
|
Refer to FigureFactory.create_violin() for docstring.
|
|
|
|
Returns fig for violin plot with colorscale.
|
|
|
|
"""
|
|
|
|
# collect all group names
|
|
group_name = []
|
|
for name in data[group_header]:
|
|
if name not in group_name:
|
|
group_name.append(name)
|
|
if sort:
|
|
group_name.sort()
|
|
|
|
# make sure all group names are keys in group_stats
|
|
for group in group_name:
|
|
if group not in group_stats:
|
|
raise exceptions.PlotlyError(
|
|
"All values/groups in the index "
|
|
"column must be represented "
|
|
"as a key in group_stats."
|
|
)
|
|
|
|
gb = data.groupby([group_header])
|
|
L = len(group_name)
|
|
|
|
fig = make_subplots(
|
|
rows=1, cols=L, shared_yaxes=True, horizontal_spacing=0.025, print_grid=False
|
|
)
|
|
|
|
# prepare low and high color for colorscale
|
|
lowcolor = clrs.color_parser(colors[0], clrs.unlabel_rgb)
|
|
highcolor = clrs.color_parser(colors[1], clrs.unlabel_rgb)
|
|
|
|
# find min and max values in group_stats
|
|
group_stats_values = []
|
|
for key in group_stats:
|
|
group_stats_values.append(group_stats[key])
|
|
|
|
max_value = max(group_stats_values)
|
|
min_value = min(group_stats_values)
|
|
|
|
for k, gr in enumerate(group_name):
|
|
vals = np.asarray(gb.get_group(gr)[data_header], np.float)
|
|
|
|
# find intermediate color from colorscale
|
|
intermed = (group_stats[gr] - min_value) / (max_value - min_value)
|
|
intermed_color = clrs.find_intermediate_color(lowcolor, highcolor, intermed)
|
|
|
|
plot_data, plot_xrange = violinplot(
|
|
vals, fillcolor="rgb{}".format(intermed_color), rugplot=rugplot
|
|
)
|
|
layout = graph_objs.Layout()
|
|
|
|
for item in plot_data:
|
|
fig.append_trace(item, 1, k + 1)
|
|
fig["layout"].update(
|
|
{"xaxis{}".format(k + 1): make_XAxis(group_name[k], plot_xrange)}
|
|
)
|
|
# add colorbar to plot
|
|
trace_dummy = graph_objs.Scatter(
|
|
x=[0],
|
|
y=[0],
|
|
mode="markers",
|
|
marker=dict(
|
|
size=2,
|
|
cmin=min_value,
|
|
cmax=max_value,
|
|
colorscale=[[0, colors[0]], [1, colors[1]]],
|
|
showscale=True,
|
|
),
|
|
showlegend=False,
|
|
)
|
|
fig.append_trace(trace_dummy, 1, L)
|
|
|
|
# set the sharey axis style
|
|
fig["layout"].update({"yaxis{}".format(1): make_YAxis("")})
|
|
fig["layout"].update(
|
|
title=title,
|
|
showlegend=False,
|
|
hovermode="closest",
|
|
autosize=False,
|
|
height=height,
|
|
width=width,
|
|
)
|
|
|
|
return fig
|
|
|
|
|
|
def violin_dict(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
):
|
|
"""
|
|
Refer to FigureFactory.create_violin() for docstring.
|
|
|
|
Returns fig for violin plot without colorscale.
|
|
|
|
"""
|
|
|
|
# collect all group names
|
|
group_name = []
|
|
for name in data[group_header]:
|
|
if name not in group_name:
|
|
group_name.append(name)
|
|
|
|
if sort:
|
|
group_name.sort()
|
|
|
|
# check if all group names appear in colors dict
|
|
for group in group_name:
|
|
if group not in colors:
|
|
raise exceptions.PlotlyError(
|
|
"If colors is a dictionary, all "
|
|
"the group names must appear as "
|
|
"keys in colors."
|
|
)
|
|
|
|
gb = data.groupby([group_header])
|
|
L = len(group_name)
|
|
|
|
fig = make_subplots(
|
|
rows=1, cols=L, shared_yaxes=True, horizontal_spacing=0.025, print_grid=False
|
|
)
|
|
|
|
for k, gr in enumerate(group_name):
|
|
vals = np.asarray(gb.get_group(gr)[data_header], np.float)
|
|
plot_data, plot_xrange = violinplot(vals, fillcolor=colors[gr], rugplot=rugplot)
|
|
layout = graph_objs.Layout()
|
|
|
|
for item in plot_data:
|
|
fig.append_trace(item, 1, k + 1)
|
|
|
|
# add violin plot labels
|
|
fig["layout"].update(
|
|
{"xaxis{}".format(k + 1): make_XAxis(group_name[k], plot_xrange)}
|
|
)
|
|
|
|
# set the sharey axis style
|
|
fig["layout"].update({"yaxis{}".format(1): make_YAxis("")})
|
|
fig["layout"].update(
|
|
title=title,
|
|
showlegend=False,
|
|
hovermode="closest",
|
|
autosize=False,
|
|
height=height,
|
|
width=width,
|
|
)
|
|
|
|
return fig
|
|
|
|
|
|
def create_violin(
|
|
data,
|
|
data_header=None,
|
|
group_header=None,
|
|
colors=None,
|
|
use_colorscale=False,
|
|
group_stats=None,
|
|
rugplot=True,
|
|
sort=False,
|
|
height=450,
|
|
width=600,
|
|
title="Violin and Rug Plot",
|
|
):
|
|
"""
|
|
**deprecated**, use instead the plotly.graph_objects trace
|
|
:class:`plotly.graph_objects.Violin`.
|
|
|
|
:param (list|array) data: accepts either a list of numerical values,
|
|
a list of dictionaries all with identical keys and at least one
|
|
column of numeric values, or a pandas dataframe with at least one
|
|
column of numbers.
|
|
:param (str) data_header: the header of the data column to be used
|
|
from an inputted pandas dataframe. Not applicable if 'data' is
|
|
a list of numeric values.
|
|
:param (str) group_header: applicable if grouping data by a variable.
|
|
'group_header' must be set to the name of the grouping variable.
|
|
:param (str|tuple|list|dict) colors: either a plotly scale name,
|
|
an rgb or hex color, a color tuple, a list of colors or a
|
|
dictionary. An rgb color is of the form 'rgb(x, y, z)' where
|
|
x, y and z belong to the interval [0, 255] and a color tuple is a
|
|
tuple of the form (a, b, c) where a, b and c belong to [0, 1].
|
|
If colors is a list, it must contain valid color types as its
|
|
members.
|
|
:param (bool) use_colorscale: only applicable if grouping by another
|
|
variable. Will implement a colorscale based on the first 2 colors
|
|
of param colors. This means colors must be a list with at least 2
|
|
colors in it (Plotly colorscales are accepted since they map to a
|
|
list of two rgb colors). Default = False
|
|
:param (dict) group_stats: a dictionary where each key is a unique
|
|
value from the group_header column in data. Each value must be a
|
|
number and will be used to color the violin plots if a colorscale
|
|
is being used.
|
|
:param (bool) rugplot: determines if a rugplot is draw on violin plot.
|
|
Default = True
|
|
:param (bool) sort: determines if violins are sorted
|
|
alphabetically (True) or by input order (False). Default = False
|
|
:param (float) height: the height of the violin plot.
|
|
:param (float) width: the width of the violin plot.
|
|
:param (str) title: the title of the violin plot.
|
|
|
|
Example 1: Single Violin Plot
|
|
|
|
>>> from plotly.figure_factory import create_violin
|
|
>>> import plotly.graph_objs as graph_objects
|
|
|
|
>>> import numpy as np
|
|
>>> from scipy import stats
|
|
|
|
>>> # create list of random values
|
|
>>> data_list = np.random.randn(100)
|
|
|
|
>>> # create violin fig
|
|
>>> fig = create_violin(data_list, colors='#604d9e')
|
|
|
|
>>> # plot
|
|
>>> fig.show()
|
|
|
|
Example 2: Multiple Violin Plots with Qualitative Coloring
|
|
|
|
>>> from plotly.figure_factory import create_violin
|
|
>>> import plotly.graph_objs as graph_objects
|
|
|
|
>>> import numpy as np
|
|
>>> import pandas as pd
|
|
>>> from scipy import stats
|
|
|
|
>>> # create dataframe
|
|
>>> np.random.seed(619517)
|
|
>>> Nr=250
|
|
>>> y = np.random.randn(Nr)
|
|
>>> gr = np.random.choice(list("ABCDE"), Nr)
|
|
>>> norm_params=[(0, 1.2), (0.7, 1), (-0.5, 1.4), (0.3, 1), (0.8, 0.9)]
|
|
|
|
>>> for i, letter in enumerate("ABCDE"):
|
|
... y[gr == letter] *=norm_params[i][1]+ norm_params[i][0]
|
|
>>> df = pd.DataFrame(dict(Score=y, Group=gr))
|
|
|
|
>>> # create violin fig
|
|
>>> fig = create_violin(df, data_header='Score', group_header='Group',
|
|
... sort=True, height=600, width=1000)
|
|
|
|
>>> # plot
|
|
>>> fig.show()
|
|
|
|
Example 3: Violin Plots with Colorscale
|
|
|
|
>>> from plotly.figure_factory import create_violin
|
|
>>> import plotly.graph_objs as graph_objects
|
|
|
|
>>> import numpy as np
|
|
>>> import pandas as pd
|
|
>>> from scipy import stats
|
|
|
|
>>> # create dataframe
|
|
>>> np.random.seed(619517)
|
|
>>> Nr=250
|
|
>>> y = np.random.randn(Nr)
|
|
>>> gr = np.random.choice(list("ABCDE"), Nr)
|
|
>>> norm_params=[(0, 1.2), (0.7, 1), (-0.5, 1.4), (0.3, 1), (0.8, 0.9)]
|
|
|
|
>>> for i, letter in enumerate("ABCDE"):
|
|
... y[gr == letter] *=norm_params[i][1]+ norm_params[i][0]
|
|
>>> df = pd.DataFrame(dict(Score=y, Group=gr))
|
|
|
|
>>> # define header params
|
|
>>> data_header = 'Score'
|
|
>>> group_header = 'Group'
|
|
|
|
>>> # make groupby object with pandas
|
|
>>> group_stats = {}
|
|
>>> groupby_data = df.groupby([group_header])
|
|
|
|
>>> for group in "ABCDE":
|
|
... data_from_group = groupby_data.get_group(group)[data_header]
|
|
... # take a stat of the grouped data
|
|
... stat = np.median(data_from_group)
|
|
... # add to dictionary
|
|
... group_stats[group] = stat
|
|
|
|
>>> # create violin fig
|
|
>>> fig = create_violin(df, data_header='Score', group_header='Group',
|
|
... height=600, width=1000, use_colorscale=True,
|
|
... group_stats=group_stats)
|
|
|
|
>>> # plot
|
|
>>> fig.show()
|
|
"""
|
|
|
|
# Validate colors
|
|
if isinstance(colors, dict):
|
|
valid_colors = clrs.validate_colors_dict(colors, "rgb")
|
|
else:
|
|
valid_colors = clrs.validate_colors(colors, "rgb")
|
|
|
|
# validate data and choose plot type
|
|
if group_header is None:
|
|
if isinstance(data, list):
|
|
if len(data) <= 0:
|
|
raise exceptions.PlotlyError(
|
|
"If data is a list, it must be "
|
|
"nonempty and contain either "
|
|
"numbers or dictionaries."
|
|
)
|
|
|
|
if not all(isinstance(element, Number) for element in data):
|
|
raise exceptions.PlotlyError(
|
|
"If data is a list, it must " "contain only numbers."
|
|
)
|
|
|
|
if pd and isinstance(data, pd.core.frame.DataFrame):
|
|
if data_header is None:
|
|
raise exceptions.PlotlyError(
|
|
"data_header must be the "
|
|
"column name with the "
|
|
"desired numeric data for "
|
|
"the violin plot."
|
|
)
|
|
|
|
data = data[data_header].values.tolist()
|
|
|
|
# call the plotting functions
|
|
plot_data, plot_xrange = violinplot(
|
|
data, fillcolor=valid_colors[0], rugplot=rugplot
|
|
)
|
|
|
|
layout = graph_objs.Layout(
|
|
title=title,
|
|
autosize=False,
|
|
font=graph_objs.layout.Font(size=11),
|
|
height=height,
|
|
showlegend=False,
|
|
width=width,
|
|
xaxis=make_XAxis("", plot_xrange),
|
|
yaxis=make_YAxis(""),
|
|
hovermode="closest",
|
|
)
|
|
layout["yaxis"].update(dict(showline=False, showticklabels=False, ticks=""))
|
|
|
|
fig = graph_objs.Figure(data=plot_data, layout=layout)
|
|
|
|
return fig
|
|
|
|
else:
|
|
if not isinstance(data, pd.core.frame.DataFrame):
|
|
raise exceptions.PlotlyError(
|
|
"Error. You must use a pandas "
|
|
"DataFrame if you are using a "
|
|
"group header."
|
|
)
|
|
|
|
if data_header is None:
|
|
raise exceptions.PlotlyError(
|
|
"data_header must be the column "
|
|
"name with the desired numeric "
|
|
"data for the violin plot."
|
|
)
|
|
|
|
if use_colorscale is False:
|
|
if isinstance(valid_colors, dict):
|
|
# validate colors dict choice below
|
|
fig = violin_dict(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
valid_colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
)
|
|
return fig
|
|
else:
|
|
fig = violin_no_colorscale(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
valid_colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
)
|
|
return fig
|
|
else:
|
|
if isinstance(valid_colors, dict):
|
|
raise exceptions.PlotlyError(
|
|
"The colors param cannot be "
|
|
"a dictionary if you are "
|
|
"using a colorscale."
|
|
)
|
|
|
|
if len(valid_colors) < 2:
|
|
raise exceptions.PlotlyError(
|
|
"colors must be a list with "
|
|
"at least 2 colors. A "
|
|
"Plotly scale is allowed."
|
|
)
|
|
|
|
if not isinstance(group_stats, dict):
|
|
raise exceptions.PlotlyError(
|
|
"Your group_stats param " "must be a dictionary."
|
|
)
|
|
|
|
fig = violin_colorscale(
|
|
data,
|
|
data_header,
|
|
group_header,
|
|
valid_colors,
|
|
use_colorscale,
|
|
group_stats,
|
|
rugplot,
|
|
sort,
|
|
height,
|
|
width,
|
|
title,
|
|
)
|
|
return fig
|