# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ---------------------------------------------------------------------
# Implement Internal ExecPlan bindings
# cython: profile=False
# distutils: language = c++
# cython: language_level = 3
from cython.operator cimport dereference as deref, preincrement as inc
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport (Table, check_status, pyarrow_unwrap_table, pyarrow_wrap_table)
from pyarrow.lib import tobytes
from pyarrow._compute cimport Expression, _true
from pyarrow._dataset cimport Dataset
from pyarrow._dataset import InMemoryDataset
# Module-import side effect: presumably registers the Dataset-related
# ExecPlan node factories (e.g. the "scan" node used below) with the
# compute engine — required before any plan in this module is built.
Initialize()  # Initialise support for Datasets in ExecPlan
cdef execplan(inputs, output_type, vector[CDeclaration] plan, c_bool use_threads=True):
    """
    Internal function to create an ExecPlan and run it.

    A source node ("table_source" for a Table, "scan" for a Dataset) is
    created for every entry in ``inputs``.  Each source is attached as an
    input of the first declaration in ``plan``; if ``plan`` is empty, the
    sources themselves become the plan.  A "sink" node is appended to
    collect the output batches, which are then read back into a Table.

    Parameters
    ----------
    inputs : list of Table or Dataset
        The sources from which the ExecPlan should fetch data.
        In most cases this is only one, unless the first node of the
        plan is able to get data from multiple different sources.
    output_type : Table or InMemoryDataset
        In which format the output should be provided.
    plan : vector[CDeclaration]
        The nodes of the plan that should be applied to the sources
        to produce the output.
    use_threads : bool, default True
        Whether to use multithreading or not.

    Returns
    -------
    Table or InMemoryDataset
        The materialized result, in the format requested by ``output_type``.

    Raises
    ------
    TypeError
        If an input is neither a Table nor a Dataset, or if
        ``output_type`` is not Table or InMemoryDataset.
    """
    cdef:
        CExecutor *c_executor
        shared_ptr[CExecContext] c_exec_context
        shared_ptr[CExecPlan] c_exec_plan
        vector[CDeclaration] c_decls
        vector[CExecNode*] _empty
        vector[CExecNode*] c_final_node_vec
        CExecNode *c_node
        CTable* c_table
        shared_ptr[CTable] c_in_table
        shared_ptr[CTable] c_out_table
        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
        shared_ptr[CScanNodeOptions] c_scanopts
        shared_ptr[CExecNodeOptions] c_input_node_opts
        shared_ptr[CSinkNodeOptions] c_sinkopts
        shared_ptr[CAsyncExecBatchGenerator] c_async_exec_batch_gen
        shared_ptr[CRecordBatchReader] c_recordbatchreader
        vector[CDeclaration].iterator plan_iter
        vector[CDeclaration.Input] no_c_inputs
        CStatus c_plan_status

    # A NULL executor makes the ExecContext run on the calling thread;
    # otherwise work is scheduled on the process-wide CPU thread pool.
    if use_threads:
        c_executor = GetCpuThreadPool()
    else:
        c_executor = NULL

    c_exec_context = make_shared[CExecContext](
        c_default_memory_pool(), c_executor)
    c_exec_plan = GetResultValue(CExecPlan.Make(c_exec_context.get()))

    plan_iter = plan.begin()

    # Create source nodes for each input
    for ipt in inputs:
        if isinstance(ipt, Table):
            node_factory = "table_source"
            c_in_table = pyarrow_unwrap_table(ipt)
            # 1 << 20 is passed as the second TableSourceNodeOptions
            # argument — presumably the max batch size emitted by the
            # source; confirm against the C++ docs before changing.
            c_tablesourceopts = make_shared[CTableSourceNodeOptions](
                c_in_table, 1 << 20)
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
                c_tablesourceopts)
        elif isinstance(ipt, Dataset):
            node_factory = "scan"
            c_in_dataset = (<Dataset>ipt).unwrap()
            c_scanopts = make_shared[CScanNodeOptions](
                c_in_dataset, make_shared[CScanOptions]())
            # Propagate the threading choice to the dataset scanner too,
            # so the scan honours use_threads like the rest of the plan.
            deref(deref(c_scanopts).scan_options).use_threads = use_threads
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CScanNodeOptions](
                c_scanopts)
        else:
            raise TypeError("Unsupported type")

        if plan_iter != plan.end():
            # Flag the source as the input of the first plan node.
            deref(plan_iter).inputs.push_back(CDeclaration.Input(
                CDeclaration(tobytes(node_factory),
                             no_c_inputs, c_input_node_opts)
            ))
        else:
            # Empty plan, make the source the first plan node.
            c_decls.push_back(
                CDeclaration(tobytes(node_factory),
                             no_c_inputs, c_input_node_opts)
            )

    # Add here the additional plan nodes (the sources, when the plan is
    # non-empty, were already attached as inputs of the first node above).
    while plan_iter != plan.end():
        c_decls.push_back(deref(plan_iter))
        inc(plan_iter)

    # Add all CDeclarations to the plan
    c_node = GetResultValue(
        CDeclaration.Sequence(c_decls).AddToPlan(&deref(c_exec_plan))
    )
    c_final_node_vec.push_back(c_node)

    # Create the output node
    c_async_exec_batch_gen = make_shared[CAsyncExecBatchGenerator]()
    c_sinkopts = make_shared[CSinkNodeOptions](c_async_exec_batch_gen.get())
    GetResultValue(
        MakeExecNode(tobytes("sink"), &deref(c_exec_plan),
                     c_final_node_vec, deref(c_sinkopts))
    )

    # Convert the asyncgenerator to a sync batch reader
    c_recordbatchreader = MakeGeneratorReader(c_node.output_schema(),
                                              deref(c_async_exec_batch_gen),
                                              deref(c_exec_context).memory_pool())

    # Start execution of the ExecPlan
    deref(c_exec_plan).Validate()
    deref(c_exec_plan).StartProducing()

    # Convert output to the expected one.
    c_out_table = GetResultValue(
        CTable.FromRecordBatchReader(c_recordbatchreader.get()))
    if output_type == Table:
        output = pyarrow_wrap_table(c_out_table)
    elif output_type == InMemoryDataset:
        output = InMemoryDataset(pyarrow_wrap_table(c_out_table))
    else:
        raise TypeError("Unsupported output type")

    # Wait (GIL released) for the plan to finish, then surface any error
    # it produced via check_status.
    with nogil:
        c_plan_status = deref(c_exec_plan).finished().status()
    check_status(c_plan_status)

    return output
def _perform_join(join_type, left_operand not None, left_keys,
                  right_operand not None, right_keys,
                  left_suffix=None, right_suffix=None,
                  use_threads=True, coalesce_keys=False,
                  output_type=Table):
    """
    Perform join of two tables or datasets.

    The result will be an output table with the result of the join operation

    Parameters
    ----------
    join_type : str
        One of supported join types.
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_keys : str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_keys : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        If the duplicated keys should be omitted from one of the sides
        in the join result.
    output_type: Table or InMemoryDataset
        The output type for the exec plan result.

    Returns
    -------
    result_table : Table or InMemoryDataset
        The join result, in the format requested by ``output_type``.
    """
    cdef:
        vector[CFieldRef] c_left_keys
        vector[CFieldRef] c_right_keys
        vector[CFieldRef] c_left_columns
        vector[CFieldRef] c_right_columns
        vector[CDeclaration] c_decl_plan
        vector[CExpression] c_projections
        vector[c_string] c_projected_col_names
        CJoinType c_join_type

    # Prepare left and right table keys to send them to the C++ function.
    # The *_keys_order dicts remember each key's position in the provided
    # key list, used later to pair left keys with right keys when
    # coalescing full-outer-join output.
    left_keys_order = {}
    if isinstance(left_keys, str):
        left_keys = [left_keys]
    for idx, key in enumerate(left_keys):
        left_keys_order[key] = idx
        c_left_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    right_keys_order = {}
    if isinstance(right_keys, str):
        right_keys = [right_keys]
    for idx, key in enumerate(right_keys):
        right_keys_order[key] = idx
        c_right_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    # By default expose all columns on both left and right table
    if isinstance(left_operand, Table):
        left_columns = left_operand.column_names
    elif isinstance(left_operand, Dataset):
        left_columns = left_operand.schema.names
    else:
        raise TypeError("Unsupported left join member type")

    if isinstance(right_operand, Table):
        right_columns = right_operand.column_names
    elif isinstance(right_operand, Dataset):
        right_columns = right_operand.schema.names
    else:
        raise TypeError("Unsupported right join member type")

    # Pick the join type, and restrict which side's columns are exposed:
    # semi/anti joins output only one side; inner/left-outer/right-outer
    # drop the other side's keys to avoid duplicating them.
    if join_type == "left semi":
        c_join_type = CJoinType_LEFT_SEMI
        right_columns = []
    elif join_type == "right semi":
        c_join_type = CJoinType_RIGHT_SEMI
        left_columns = []
    elif join_type == "left anti":
        c_join_type = CJoinType_LEFT_ANTI
        right_columns = []
    elif join_type == "right anti":
        c_join_type = CJoinType_RIGHT_ANTI
        left_columns = []
    elif join_type == "inner":
        c_join_type = CJoinType_INNER
        right_columns = set(right_columns) - set(right_keys)
    elif join_type == "left outer":
        c_join_type = CJoinType_LEFT_OUTER
        right_columns = set(right_columns) - set(right_keys)
    elif join_type == "right outer":
        c_join_type = CJoinType_RIGHT_OUTER
        left_columns = set(left_columns) - set(left_keys)
    elif join_type == "full outer":
        c_join_type = CJoinType_FULL_OUTER
    else:
        raise ValueError("Unsupported join type")

    # Turn the columns to vectors of FieldRefs
    # and set aside indices of keys.
    left_column_keys_indices = {}
    for idx, colname in enumerate(left_columns):
        c_left_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in left_keys:
            left_column_keys_indices[colname] = idx
    right_column_keys_indices = {}
    for idx, colname in enumerate(right_columns):
        c_right_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in right_keys:
            right_column_keys_indices[colname] = idx

    # Add the join node to the execplan
    if coalesce_keys:
        # Explicitly pass the output columns for each side so the join
        # emits exactly the columns selected above.
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                c_left_columns, c_right_columns,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )
        if join_type == "full outer":
            # In case of full outer joins, the join operation will output all columns
            # so that we can coalesce the keys and exclude duplicates in a subsequent projection.
            left_columns_set = set(left_columns)
            right_columns_set = set(right_columns)
            # Where the right table columns start.
            right_operand_index = len(left_columns)
            for idx, col in enumerate(left_columns + right_columns):
                if idx < len(left_columns) and col in left_column_keys_indices:
                    # Include keys only once and coalesce left+right table keys.
                    c_projected_col_names.push_back(tobytes(col))
                    # Get the index of the right key that is being paired
                    # with this left key. We do so by retrieving the name
                    # of the right key that is in the same position in the provided keys
                    # and then looking up the index for that name in the right table.
                    right_key_index = right_column_keys_indices[right_keys[left_keys_order[col]]]
                    c_projections.push_back(Expression.unwrap(
                        Expression._call("coalesce", [
                            Expression._field(idx), Expression._field(
                                right_operand_index+right_key_index)
                        ])
                    ))
                elif idx >= right_operand_index and col in right_column_keys_indices:
                    # Do not include right table keys. As they would lead to duplicated keys.
                    continue
                else:
                    # For all the other columns include them as they are.
                    # Just recompute the suffixes that the join produced as the projection
                    # would lose them otherwise.
                    if left_suffix and idx < right_operand_index and col in right_columns_set:
                        col += left_suffix
                    if right_suffix and idx >= right_operand_index and col in left_columns_set:
                        col += right_suffix
                    c_projected_col_names.push_back(tobytes(col))
                    c_projections.push_back(
                        Expression.unwrap(Expression._field(idx)))
            c_decl_plan.push_back(
                CDeclaration(tobytes("project"), CProjectNodeOptions(
                    c_projections, c_projected_col_names))
            )
    else:
        # No coalescing: let the join node emit its default output columns.
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )

    result_table = execplan([left_operand, right_operand],
                            plan=c_decl_plan,
                            output_type=output_type,
                            use_threads=use_threads)

    return result_table