# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from collections import OrderedDict from collections.abc import Iterable import pickle import sys import weakref import numpy as np import pytest import pyarrow as pa import pyarrow.compute as pc def test_chunked_array_basics(): data = pa.chunked_array([], type=pa.string()) assert data.type == pa.string() assert data.to_pylist() == [] data.validate() data2 = pa.chunked_array([], type='binary') assert data2.type == pa.binary() with pytest.raises(ValueError): pa.chunked_array([]) data = pa.chunked_array([ [1, 2, 3], [4, 5, 6], [7, 8, 9] ]) assert isinstance(data.chunks, list) assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks) assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks()) assert len(data.chunks) == 3 assert data.get_total_buffer_size() == sum(c.get_total_buffer_size() for c in data.iterchunks()) assert sys.getsizeof(data) >= object.__sizeof__( data) + data.get_total_buffer_size() assert data.nbytes == 3 * 3 * 8 # 3 items per 3 lists with int64 size(8) data.validate() wr = weakref.ref(data) assert wr() is not None del data assert wr() is None def test_chunked_array_construction(): arr = pa.chunked_array([ [1, 2, 3], [4, 5, 6], [7, 8, 9], ]) assert arr.type == pa.int64() assert len(arr) == 9 assert len(arr.chunks) == 3 arr = pa.chunked_array([ [1, 2, 3], [4., 5., 6.], [7, 8, 9], ]) assert arr.type == pa.int64() assert len(arr) == 9 assert len(arr.chunks) == 3 arr = pa.chunked_array([ [1, 2, 3], [4., 5., 6.], [7, 8, 9], ], type=pa.int8()) assert arr.type == pa.int8() assert len(arr) == 9 assert len(arr.chunks) == 3 arr = pa.chunked_array([ [1, 2, 3], [] ]) assert arr.type == pa.int64() assert len(arr) == 3 assert len(arr.chunks) == 2 msg = ( "When passing an empty collection of arrays you must also pass the " "data type" ) with pytest.raises(ValueError, match=msg): assert pa.chunked_array([]) assert pa.chunked_array([], type=pa.string()).type == pa.string() assert pa.chunked_array([[]]).type == pa.null() assert pa.chunked_array([[]], type=pa.string()).type == pa.string() def test_combine_chunks(): # ARROW-77363 arr = pa.array([1, 2]) chunked_arr = pa.chunked_array([arr, arr]) res = chunked_arr.combine_chunks() expected = pa.array([1, 2, 1, 2]) assert res.equals(expected) def test_chunked_array_to_numpy(): data = pa.chunked_array([ [1, 2, 3], [4, 5, 6], [] ]) arr1 = np.asarray(data) arr2 = data.to_numpy() assert isinstance(arr2, np.ndarray) assert arr2.shape == (6,) assert np.array_equal(arr1, arr2) def test_chunked_array_mismatch_types(): with pytest.raises(TypeError): # Given array types are different pa.chunked_array([ pa.array([1, 2, 3]), pa.array([1., 2., 3.]) ]) with pytest.raises(TypeError): # Given array type is different from explicit type argument pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64()) def test_chunked_array_str(): data = [ pa.array([1, 2, 3]), pa.array([4, 5, 6]) ] data = pa.chunked_array(data) assert str(data) == """[ [ 1, 2, 3 ], [ 4, 5, 6 ] ]""" def test_chunked_array_getitem(): data = [ pa.array([1, 2, 3]), pa.array([4, 5, 6]) ] data = pa.chunked_array(data) assert data[1].as_py() == 2 assert data[-1].as_py() == 6 assert data[-6].as_py() == 1 with pytest.raises(IndexError): data[6] with pytest.raises(IndexError): data[-7] # Ensure this works with numpy scalars assert data[np.int32(1)].as_py() == 2 data_slice = data[2:4] assert data_slice.to_pylist() == [3, 4] data_slice = data[4:-1] assert data_slice.to_pylist() == [5] data_slice = data[99:99] assert data_slice.type == data.type assert data_slice.to_pylist() == [] def test_chunked_array_slice(): data = [ pa.array([1, 2, 3]), pa.array([4, 5, 6]) ] data = pa.chunked_array(data) data_slice = data.slice(len(data)) assert data_slice.type == data.type assert data_slice.to_pylist() == [] data_slice = data.slice(len(data) + 10) assert data_slice.type == data.type assert data_slice.to_pylist() == [] table = pa.Table.from_arrays([data], names=["a"]) table_slice = table.slice(len(table)) assert len(table_slice) == 0 table = pa.Table.from_arrays([data], names=["a"]) table_slice = table.slice(len(table) + 10) assert len(table_slice) == 0 def test_chunked_array_iter(): data = [ pa.array([0]), pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([7, 8, 9]) ] arr = pa.chunked_array(data) for i, j in zip(range(10), arr): assert i == j.as_py() assert isinstance(arr, Iterable) def test_chunked_array_equals(): def eq(xarrs, yarrs): if isinstance(xarrs, pa.ChunkedArray): x = xarrs else: x = pa.chunked_array(xarrs) if isinstance(yarrs, pa.ChunkedArray): y = yarrs else: y = pa.chunked_array(yarrs) assert x.equals(y) assert y.equals(x) assert x == y assert x != str(y) def ne(xarrs, yarrs): if isinstance(xarrs, pa.ChunkedArray): x = xarrs else: x = pa.chunked_array(xarrs) if isinstance(yarrs, pa.ChunkedArray): y = yarrs else: y = pa.chunked_array(yarrs) assert not x.equals(y) assert not y.equals(x) assert x != y eq(pa.chunked_array([], type=pa.int32()), pa.chunked_array([], type=pa.int32())) ne(pa.chunked_array([], type=pa.int32()), pa.chunked_array([], type=pa.int64())) a = pa.array([0, 2], type=pa.int32()) b = pa.array([0, 2], type=pa.int64()) c = pa.array([0, 3], type=pa.int32()) d = pa.array([0, 2, 0, 3], type=pa.int32()) eq([a], [a]) ne([a], [b]) eq([a, c], [a, c]) eq([a, c], [d]) ne([c, a], [a, c]) # ARROW-4822 assert not pa.chunked_array([], type=pa.int32()).equals(None) @pytest.mark.parametrize( ('data', 'typ'), [ ([True, False, True, True], pa.bool_()), ([1, 2, 4, 6], pa.int64()), ([1.0, 2.5, None], pa.float64()), (['a', None, 'b'], pa.string()), ([], pa.list_(pa.uint8())), ([[1, 2], [3]], pa.list_(pa.int64())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) ] ) def test_chunked_array_pickle(data, typ): arrays = [] while data: arrays.append(pa.array(data[:2], type=typ)) data = data[2:] array = pa.chunked_array(arrays, type=typ) array.validate() result = pickle.loads(pickle.dumps(array)) result.validate() assert result.equals(array) @pytest.mark.pandas def test_chunked_array_to_pandas(): import pandas as pd data = [ pa.array([-10, -5, 0, 5, 10]) ] table = pa.table(data, names=['a']) col = table.column(0) assert isinstance(col, pa.ChunkedArray) series = col.to_pandas() assert isinstance(series, pd.Series) assert series.shape == (5,) assert series[0] == -10 assert series.name == 'a' @pytest.mark.pandas def test_chunked_array_to_pandas_preserve_name(): # https://issues.apache.org/jira/browse/ARROW-7709 import pandas as pd import pandas.testing as tm for data in [ pa.array([1, 2, 3]), pa.array(pd.Categorical(["a", "b", "a"])), pa.array(pd.date_range("2012", periods=3)), pa.array(pd.date_range("2012", periods=3, tz="Europe/Brussels")), pa.array([1, 2, 3], pa.timestamp("ms")), pa.array([1, 2, 3], pa.timestamp("ms", "Europe/Brussels"))]: table = pa.table({"name": data}) result = table.column("name").to_pandas() assert result.name == "name" expected = pd.Series(data.to_pandas(), name="name") tm.assert_series_equal(result, expected) @pytest.mark.pandas def test_table_roundtrip_to_pandas_empty_dataframe(): # https://issues.apache.org/jira/browse/ARROW-10643 # The conversion should not results in a table with 0 rows if the original # DataFrame has a RangeIndex but is empty. import pandas as pd data = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) table = pa.table(data) result = table.to_pandas() assert table.num_rows == 10 assert data.shape == (10, 0) assert result.shape == (10, 0) assert result.index.equals(data.index) data = pd.DataFrame(index=pd.RangeIndex(0, 10, 3)) table = pa.table(data) result = table.to_pandas() assert table.num_rows == 4 assert data.shape == (4, 0) assert result.shape == (4, 0) assert result.index.equals(data.index) @pytest.mark.pandas def test_recordbatch_roundtrip_to_pandas_empty_dataframe(): # https://issues.apache.org/jira/browse/ARROW-10643 # The conversion should not results in a RecordBatch with 0 rows if # the original DataFrame has a RangeIndex but is empty. import pandas as pd data = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) batch = pa.RecordBatch.from_pandas(data) result = batch.to_pandas() assert batch.num_rows == 10 assert data.shape == (10, 0) assert result.shape == (10, 0) assert result.index.equals(data.index) data = pd.DataFrame(index=pd.RangeIndex(0, 10, 3)) batch = pa.RecordBatch.from_pandas(data) result = batch.to_pandas() assert batch.num_rows == 4 assert data.shape == (4, 0) assert result.shape == (4, 0) assert result.index.equals(data.index) @pytest.mark.pandas def test_to_pandas_empty_table(): # https://issues.apache.org/jira/browse/ARROW-15370 import pandas as pd import pandas.testing as tm df = pd.DataFrame({'a': [1, 2], 'b': [0.1, 0.2]}) table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) tm.assert_frame_equal(result, df.iloc[:0]) @pytest.mark.pandas @pytest.mark.nopandas def test_chunked_array_asarray(): # ensure this is tested both when pandas is present or not (ARROW-6564) data = [ pa.array([0]), pa.array([1, 2, 3]) ] chunked_arr = pa.chunked_array(data) np_arr = np.asarray(chunked_arr) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('int64') # An optional type can be specified when calling np.asarray np_arr = np.asarray(chunked_arr, dtype='str') assert np_arr.tolist() == ['0', '1', '2', '3'] # Types are modified when there are nulls data = [ pa.array([1, None]), pa.array([1, 2, 3]) ] chunked_arr = pa.chunked_array(data) np_arr = np.asarray(chunked_arr) elements = np_arr.tolist() assert elements[0] == 1. assert np.isnan(elements[1]) assert elements[2:] == [1., 2., 3.] assert np_arr.dtype == np.dtype('float64') # DictionaryType data will be converted to dense numpy array arr = pa.DictionaryArray.from_arrays( pa.array([0, 1, 2, 0, 1]), pa.array(['a', 'b', 'c'])) chunked_arr = pa.chunked_array([arr, arr]) np_arr = np.asarray(chunked_arr) assert np_arr.dtype == np.dtype('object') assert np_arr.tolist() == ['a', 'b', 'c', 'a', 'b'] * 2 def test_chunked_array_flatten(): ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())]) a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) carr = pa.chunked_array(a) x, y = carr.flatten() assert x.equals(pa.chunked_array(pa.array([1, 3, 5], type=pa.int16()))) assert y.equals(pa.chunked_array(pa.array([2.5, 4.5, 6.5], type=pa.float32()))) # Empty column a = pa.array([], type=ty) carr = pa.chunked_array(a) x, y = carr.flatten() assert x.equals(pa.chunked_array(pa.array([], type=pa.int16()))) assert y.equals(pa.chunked_array(pa.array([], type=pa.float32()))) def test_chunked_array_unify_dictionaries(): arr = pa.chunked_array([ pa.array(["foo", "bar", None, "foo"]).dictionary_encode(), pa.array(["quux", None, "foo"]).dictionary_encode(), ]) assert arr.chunk(0).dictionary.equals(pa.array(["foo", "bar"])) assert arr.chunk(1).dictionary.equals(pa.array(["quux", "foo"])) arr = arr.unify_dictionaries() expected_dict = pa.array(["foo", "bar", "quux"]) assert arr.chunk(0).dictionary.equals(expected_dict) assert arr.chunk(1).dictionary.equals(expected_dict) assert arr.to_pylist() == ["foo", "bar", None, "foo", "quux", None, "foo"] def test_recordbatch_basics(): data = [ pa.array(range(5), type='int16'), pa.array([-10, -5, 0, None, 10], type='int32') ] batch = pa.record_batch(data, ['c0', 'c1']) assert not batch.schema.metadata assert len(batch) == 5 assert batch.num_rows == 5 assert batch.num_columns == len(data) # (only the second array has a null bitmap) assert batch.get_total_buffer_size() == (5 * 2) + (5 * 4 + 1) batch.nbytes == (5 * 2) + (5 * 4 + 1) assert sys.getsizeof(batch) >= object.__sizeof__( batch) + batch.get_total_buffer_size() pydict = batch.to_pydict() assert pydict == OrderedDict([ ('c0', [0, 1, 2, 3, 4]), ('c1', [-10, -5, 0, None, 10]) ]) assert type(pydict) == dict with pytest.raises(IndexError): # bounds checking batch[2] # Schema passed explicitly schema = pa.schema([pa.field('c0', pa.int16(), metadata={'key': 'value'}), pa.field('c1', pa.int32())], metadata={b'foo': b'bar'}) batch = pa.record_batch(data, schema=schema) assert batch.schema == schema # schema as first positional argument batch = pa.record_batch(data, schema) assert batch.schema == schema assert str(batch) == """pyarrow.RecordBatch c0: int16 c1: int32""" assert batch.to_string(show_metadata=True) == """\ pyarrow.RecordBatch c0: int16 -- field metadata -- key: 'value' c1: int32 -- schema metadata -- foo: 'bar'""" wr = weakref.ref(batch) assert wr() is not None del batch assert wr() is None def test_recordbatch_equals(): data1 = [ pa.array(range(5), type='int16'), pa.array([-10, -5, 0, None, 10], type='int32') ] data2 = [ pa.array(['a', 'b', 'c']), pa.array([['d'], ['e'], ['f']]), ] column_names = ['c0', 'c1'] batch = pa.record_batch(data1, column_names) assert batch == pa.record_batch(data1, column_names) assert batch.equals(pa.record_batch(data1, column_names)) assert batch != pa.record_batch(data2, column_names) assert not batch.equals(pa.record_batch(data2, column_names)) batch_meta = pa.record_batch(data1, names=column_names, metadata={'key': 'value'}) assert batch_meta.equals(batch) assert not batch_meta.equals(batch, check_metadata=True) # ARROW-8889 assert not batch.equals(None) assert batch != "foo" def test_recordbatch_take(): batch = pa.record_batch( [pa.array([1, 2, 3, None, 5]), pa.array(['a', 'b', 'c', 'd', 'e'])], ['f1', 'f2']) assert batch.take(pa.array([2, 3])).equals(batch.slice(2, 2)) assert batch.take(pa.array([2, None])).equals( pa.record_batch([pa.array([3, None]), pa.array(['c', None])], ['f1', 'f2'])) def test_recordbatch_column_sets_private_name(): # ARROW-6429 rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0']) assert rb[0]._name == 'a0' def test_recordbatch_from_arrays_validate_schema(): # ARROW-6263 arr = pa.array([1, 2]) schema = pa.schema([pa.field('f0', pa.list_(pa.utf8()))]) with pytest.raises(NotImplementedError): pa.record_batch([arr], schema=schema) def test_recordbatch_from_arrays_validate_lengths(): # ARROW-2820 data = [pa.array([1]), pa.array(["tokyo", "like", "happy"]), pa.array(["derek"])] with pytest.raises(ValueError): pa.record_batch(data, ['id', 'tags', 'name']) def test_recordbatch_no_fields(): batch = pa.record_batch([], []) assert len(batch) == 0 assert batch.num_rows == 0 assert batch.num_columns == 0 def test_recordbatch_from_arrays_invalid_names(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]) ] with pytest.raises(ValueError): pa.record_batch(data, names=['a', 'b', 'c']) with pytest.raises(ValueError): pa.record_batch(data, names=['a']) def test_recordbatch_empty_metadata(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]) ] batch = pa.record_batch(data, ['c0', 'c1']) assert batch.schema.metadata is None def test_recordbatch_pickle(): data = [ pa.array(range(5), type='int8'), pa.array([-10, -5, 0, 5, 10], type='float32') ] fields = [ pa.field('ints', pa.int8()), pa.field('floats', pa.float32()), ] schema = pa.schema(fields, metadata={b'foo': b'bar'}) batch = pa.record_batch(data, schema=schema) result = pickle.loads(pickle.dumps(batch)) assert result.equals(batch) assert result.schema == schema def test_recordbatch_get_field(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] batch = pa.RecordBatch.from_arrays(data, names=('a', 'b', 'c')) assert batch.field('a').equals(batch.schema.field('a')) assert batch.field(0).equals(batch.schema.field('a')) with pytest.raises(KeyError): batch.field('d') with pytest.raises(TypeError): batch.field(None) with pytest.raises(IndexError): batch.field(4) def test_recordbatch_select_column(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] batch = pa.RecordBatch.from_arrays(data, names=('a', 'b', 'c')) assert batch.column('a').equals(batch.column(0)) with pytest.raises( KeyError, match='Field "d" does not exist in record batch schema'): batch.column('d') with pytest.raises(TypeError): batch.column(None) with pytest.raises(IndexError): batch.column(4) def test_recordbatch_from_struct_array_invalid(): with pytest.raises(TypeError): pa.RecordBatch.from_struct_array(pa.array(range(5))) def test_recordbatch_from_struct_array(): struct_array = pa.array( [{"ints": 1}, {"floats": 1.0}], type=pa.struct([("ints", pa.int32()), ("floats", pa.float32())]), ) result = pa.RecordBatch.from_struct_array(struct_array) assert result.equals(pa.RecordBatch.from_arrays( [ pa.array([1, None], type=pa.int32()), pa.array([None, 1.0], type=pa.float32()), ], ["ints", "floats"] )) def _table_like_slice_tests(factory): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]) ] names = ['c0', 'c1'] obj = factory(data, names=names) sliced = obj.slice(2) assert sliced.num_rows == 3 expected = factory([x.slice(2) for x in data], names=names) assert sliced.equals(expected) sliced2 = obj.slice(2, 2) expected2 = factory([x.slice(2, 2) for x in data], names=names) assert sliced2.equals(expected2) # 0 offset assert obj.slice(0).equals(obj) # Slice past end of array assert len(obj.slice(len(obj))) == 0 with pytest.raises(IndexError): obj.slice(-1) # Check __getitem__-based slicing assert obj.slice(0, 0).equals(obj[:0]) assert obj.slice(0, 2).equals(obj[:2]) assert obj.slice(2, 2).equals(obj[2:4]) assert obj.slice(2, len(obj) - 2).equals(obj[2:]) assert obj.slice(len(obj) - 2, 2).equals(obj[-2:]) assert obj.slice(len(obj) - 4, 2).equals(obj[-4:-2]) def test_recordbatch_slice_getitem(): return _table_like_slice_tests(pa.RecordBatch.from_arrays) def test_table_slice_getitem(): return _table_like_slice_tests(pa.table) @pytest.mark.pandas def test_slice_zero_length_table(): # ARROW-7907: a segfault on this code was fixed after 0.16.0 table = pa.table({'a': pa.array([], type=pa.timestamp('us'))}) table_slice = table.slice(0, 0) table_slice.to_pandas() table = pa.table({'a': pa.chunked_array([], type=pa.string())}) table.to_pandas() def test_recordbatchlist_schema_equals(): a1 = np.array([1], dtype='uint32') a2 = np.array([4.0, 5.0], dtype='float64') batch1 = pa.record_batch([pa.array(a1)], ['c1']) batch2 = pa.record_batch([pa.array(a2)], ['c1']) with pytest.raises(pa.ArrowInvalid): pa.Table.from_batches([batch1, batch2]) def test_table_column_sets_private_name(): # ARROW-6429 t = pa.table([pa.array([1, 2, 3, 4])], names=['a0']) assert t[0]._name == 'a0' def test_table_equals(): table = pa.Table.from_arrays([], names=[]) assert table.equals(table) # ARROW-4822 assert not table.equals(None) other = pa.Table.from_arrays([], names=[], metadata={'key': 'value'}) assert not table.equals(other, check_metadata=True) assert table.equals(other) def test_table_from_batches_and_schema(): schema = pa.schema([ pa.field('a', pa.int64()), pa.field('b', pa.float64()), ]) batch = pa.record_batch([pa.array([1]), pa.array([3.14])], names=['a', 'b']) table = pa.Table.from_batches([batch], schema) assert table.schema.equals(schema) assert table.column(0) == pa.chunked_array([[1]]) assert table.column(1) == pa.chunked_array([[3.14]]) incompatible_schema = pa.schema([pa.field('a', pa.int64())]) with pytest.raises(pa.ArrowInvalid): pa.Table.from_batches([batch], incompatible_schema) incompatible_batch = pa.record_batch([pa.array([1])], ['a']) with pytest.raises(pa.ArrowInvalid): pa.Table.from_batches([incompatible_batch], schema) @pytest.mark.pandas def test_table_to_batches(): from pandas.testing import assert_frame_equal import pandas as pd df1 = pd.DataFrame({'a': list(range(10))}) df2 = pd.DataFrame({'a': list(range(10, 30))}) batch1 = pa.RecordBatch.from_pandas(df1, preserve_index=False) batch2 = pa.RecordBatch.from_pandas(df2, preserve_index=False) table = pa.Table.from_batches([batch1, batch2, batch1]) expected_df = pd.concat([df1, df2, df1], ignore_index=True) batches = table.to_batches() assert len(batches) == 3 assert_frame_equal(pa.Table.from_batches(batches).to_pandas(), expected_df) batches = table.to_batches(max_chunksize=15) assert list(map(len, batches)) == [10, 15, 5, 10] assert_frame_equal(table.to_pandas(), expected_df) assert_frame_equal(pa.Table.from_batches(batches).to_pandas(), expected_df) table_from_iter = pa.Table.from_batches(iter([batch1, batch2, batch1])) assert table.equals(table_from_iter) def test_table_basics(): data = [ pa.array(range(5), type='int64'), pa.array([-10, -5, 0, 5, 10], type='int64') ] table = pa.table(data, names=('a', 'b')) table.validate() assert len(table) == 5 assert table.num_rows == 5 assert table.num_columns == 2 assert table.shape == (5, 2) assert table.get_total_buffer_size() == 2 * (5 * 8) assert table.nbytes == 2 * (5 * 8) assert sys.getsizeof(table) >= object.__sizeof__( table) + table.get_total_buffer_size() pydict = table.to_pydict() assert pydict == OrderedDict([ ('a', [0, 1, 2, 3, 4]), ('b', [-10, -5, 0, 5, 10]) ]) assert type(pydict) == dict columns = [] for col in table.itercolumns(): columns.append(col) for chunk in col.iterchunks(): assert chunk is not None with pytest.raises(IndexError): col.chunk(-1) with pytest.raises(IndexError): col.chunk(col.num_chunks) assert table.columns == columns assert table == pa.table(columns, names=table.column_names) assert table != pa.table(columns[1:], names=table.column_names[1:]) assert table != columns wr = weakref.ref(table) assert wr() is not None del table assert wr() is None def test_table_from_arrays_preserves_column_metadata(): # Added to test https://issues.apache.org/jira/browse/ARROW-3866 arr0 = pa.array([1, 2]) arr1 = pa.array([3, 4]) field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B")) field1 = pa.field('field2', pa.int64(), nullable=False) table = pa.Table.from_arrays([arr0, arr1], schema=pa.schema([field0, field1])) assert b"a" in table.field(0).metadata assert table.field(1).nullable is False def test_table_from_arrays_invalid_names(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]) ] with pytest.raises(ValueError): pa.Table.from_arrays(data, names=['a', 'b', 'c']) with pytest.raises(ValueError): pa.Table.from_arrays(data, names=['a']) def test_table_from_lists(): data = [ list(range(5)), [-10, -5, 0, 5, 10] ] result = pa.table(data, names=['a', 'b']) expected = pa.Table.from_arrays(data, names=['a', 'b']) assert result.equals(expected) schema = pa.schema([ pa.field('a', pa.uint16()), pa.field('b', pa.int64()) ]) result = pa.table(data, schema=schema) expected = pa.Table.from_arrays(data, schema=schema) assert result.equals(expected) def test_table_pickle(): data = [ pa.chunked_array([[1, 2], [3, 4]], type=pa.uint32()), pa.chunked_array([["some", "strings", None, ""]], type=pa.string()), ] schema = pa.schema([pa.field('ints', pa.uint32()), pa.field('strs', pa.string())], metadata={b'foo': b'bar'}) table = pa.Table.from_arrays(data, schema=schema) result = pickle.loads(pickle.dumps(table)) result.validate() assert result.equals(table) def test_table_get_field(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) assert table.field('a').equals(table.schema.field('a')) assert table.field(0).equals(table.schema.field('a')) with pytest.raises(KeyError): table.field('d') with pytest.raises(TypeError): table.field(None) with pytest.raises(IndexError): table.field(4) def test_table_select_column(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) assert table.column('a').equals(table.column(0)) with pytest.raises(KeyError, match='Field "d" does not exist in table schema'): table.column('d') with pytest.raises(TypeError): table.column(None) with pytest.raises(IndexError): table.column(4) def test_table_column_with_duplicates(): # ARROW-8209 table = pa.table([pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([7, 8, 9])], names=['a', 'b', 'a']) with pytest.raises(KeyError, match='Field "a" exists 2 times in table schema'): table.column('a') def test_table_add_column(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) new_field = pa.field('d', data[1].type) t2 = table.add_column(3, new_field, data[1]) t3 = table.append_column(new_field, data[1]) expected = pa.Table.from_arrays(data + [data[1]], names=('a', 'b', 'c', 'd')) assert t2.equals(expected) assert t3.equals(expected) t4 = table.add_column(0, new_field, data[1]) expected = pa.Table.from_arrays([data[1]] + data, names=('d', 'a', 'b', 'c')) assert t4.equals(expected) def test_table_set_column(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) new_field = pa.field('d', data[1].type) t2 = table.set_column(0, new_field, data[1]) expected_data = list(data) expected_data[0] = data[1] expected = pa.Table.from_arrays(expected_data, names=('d', 'b', 'c')) assert t2.equals(expected) def test_table_drop(): """ drop one or more columns given labels""" a = pa.array(range(5)) b = pa.array([-10, -5, 0, 5, 10]) c = pa.array(range(5, 10)) table = pa.Table.from_arrays([a, b, c], names=('a', 'b', 'c')) t2 = table.drop(['a', 'b']) exp = pa.Table.from_arrays([c], names=('c',)) assert exp.equals(t2) # -- raise KeyError if column not in Table with pytest.raises(KeyError, match="Column 'd' not found"): table.drop(['d']) def test_table_remove_column(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) t2 = table.remove_column(0) t2.validate() expected = pa.Table.from_arrays(data[1:], names=('b', 'c')) assert t2.equals(expected) def test_table_remove_column_empty(): # ARROW-1865 data = [ pa.array(range(5)), ] table = pa.Table.from_arrays(data, names=['a']) t2 = table.remove_column(0) t2.validate() assert len(t2) == len(table) t3 = t2.add_column(0, table.field(0), table[0]) t3.validate() assert t3.equals(table) def test_empty_table_with_names(): # ARROW-13784 data = [] names = ["a", "b"] message = ( 'Length of names [(]2[)] does not match length of arrays [(]0[)]') with pytest.raises(ValueError, match=message): pa.Table.from_arrays(data, names=names) def test_empty_table(): table = pa.table([]) assert table.column_names == [] assert table.equals(pa.Table.from_arrays([], [])) def test_table_rename_columns(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array(range(5, 10)) ] table = pa.Table.from_arrays(data, names=['a', 'b', 'c']) assert table.column_names == ['a', 'b', 'c'] t2 = table.rename_columns(['eh', 'bee', 'sea']) t2.validate() assert t2.column_names == ['eh', 'bee', 'sea'] expected = pa.Table.from_arrays(data, names=['eh', 'bee', 'sea']) assert t2.equals(expected) def test_table_flatten(): ty1 = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())]) ty2 = pa.struct([pa.field('nest', ty1)]) a = pa.array([(1, 2.5), (3, 4.5)], type=ty1) b = pa.array([((11, 12.5),), ((13, 14.5),)], type=ty2) c = pa.array([False, True], type=pa.bool_()) table = pa.Table.from_arrays([a, b, c], names=['a', 'b', 'c']) t2 = table.flatten() t2.validate() expected = pa.Table.from_arrays([ pa.array([1, 3], type=pa.int16()), pa.array([2.5, 4.5], type=pa.float32()), pa.array([(11, 12.5), (13, 14.5)], type=ty1), c], names=['a.x', 'a.y', 'b.nest', 'c']) assert t2.equals(expected) def test_table_combine_chunks(): batch1 = pa.record_batch([pa.array([1]), pa.array(["a"])], names=['f1', 'f2']) batch2 = pa.record_batch([pa.array([2]), pa.array(["b"])], names=['f1', 'f2']) table = pa.Table.from_batches([batch1, batch2]) combined = table.combine_chunks() combined.validate() assert combined.equals(table) for c in combined.columns: assert c.num_chunks == 1 def test_table_unify_dictionaries(): batch1 = pa.record_batch([ pa.array(["foo", "bar", None, "foo"]).dictionary_encode(), pa.array([123, 456, 456, 789]).dictionary_encode(), pa.array([True, False, None, None])], names=['a', 'b', 'c']) batch2 = pa.record_batch([ pa.array(["quux", "foo", None, "quux"]).dictionary_encode(), pa.array([456, 789, 789, None]).dictionary_encode(), pa.array([False, None, None, True])], names=['a', 'b', 'c']) table = pa.Table.from_batches([batch1, batch2]) table = table.replace_schema_metadata({b"key1": b"value1"}) assert table.column(0).chunk(0).dictionary.equals( pa.array(["foo", "bar"])) assert table.column(0).chunk(1).dictionary.equals( pa.array(["quux", "foo"])) assert table.column(1).chunk(0).dictionary.equals( pa.array([123, 456, 789])) assert table.column(1).chunk(1).dictionary.equals( pa.array([456, 789])) table = table.unify_dictionaries(pa.default_memory_pool()) expected_dict_0 = pa.array(["foo", "bar", "quux"]) expected_dict_1 = pa.array([123, 456, 789]) assert table.column(0).chunk(0).dictionary.equals(expected_dict_0) assert table.column(0).chunk(1).dictionary.equals(expected_dict_0) assert table.column(1).chunk(0).dictionary.equals(expected_dict_1) assert table.column(1).chunk(1).dictionary.equals(expected_dict_1) assert table.to_pydict() == { 'a': ["foo", "bar", None, "foo", "quux", "foo", None, "quux"], 'b': [123, 456, 456, 789, 456, 789, 789, None], 'c': [True, False, None, None, False, None, None, True], } assert table.schema.metadata == {b"key1": b"value1"} def test_concat_tables(): data = [ list(range(5)), [-10., -5., 0., 5., 10.] ] data2 = [ list(range(5, 10)), [1., 2., 3., 4., 5.] ] t1 = pa.Table.from_arrays([pa.array(x) for x in data], names=('a', 'b')) t2 = pa.Table.from_arrays([pa.array(x) for x in data2], names=('a', 'b')) result = pa.concat_tables([t1, t2]) result.validate() assert len(result) == 10 expected = pa.Table.from_arrays([pa.array(x + y) for x, y in zip(data, data2)], names=('a', 'b')) assert result.equals(expected) def test_concat_tables_none_table(): # ARROW-11997 with pytest.raises(AttributeError): pa.concat_tables([None]) @pytest.mark.pandas def test_concat_tables_with_different_schema_metadata(): import pandas as pd schema = pa.schema([ pa.field('a', pa.string()), pa.field('b', pa.string()), ]) values = list('abcdefgh') df1 = pd.DataFrame({'a': values, 'b': values}) df2 = pd.DataFrame({'a': [np.nan] * 8, 'b': values}) table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False) table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False) assert table1.schema.equals(table2.schema) assert not table1.schema.equals(table2.schema, check_metadata=True) table3 = pa.concat_tables([table1, table2]) assert table1.schema.equals(table3.schema, check_metadata=True) assert table2.schema.equals(table3.schema) def test_concat_tables_with_promotion(): t1 = pa.Table.from_arrays( [pa.array([1, 2], type=pa.int64())], ["int64_field"]) t2 = pa.Table.from_arrays( [pa.array([1.0, 2.0], type=pa.float32())], ["float_field"]) result = pa.concat_tables([t1, t2], promote=True) assert result.equals(pa.Table.from_arrays([ pa.array([1, 2, None, None], type=pa.int64()), pa.array([None, None, 1.0, 2.0], type=pa.float32()), ], ["int64_field", "float_field"])) def test_concat_tables_with_promotion_error(): t1 = pa.Table.from_arrays( [pa.array([1, 2], type=pa.int64())], ["f"]) t2 = pa.Table.from_arrays( [pa.array([1, 2], type=pa.float32())], ["f"]) with pytest.raises(pa.ArrowInvalid): pa.concat_tables([t1, t2], promote=True) def test_table_negative_indexing(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), pa.array([1.0, 2.0, 3.0, 4.0, 5.0]), pa.array(['ab', 'bc', 'cd', 'de', 'ef']), ] table = pa.Table.from_arrays(data, names=tuple('abcd')) assert table[-1].equals(table[3]) assert table[-2].equals(table[2]) assert table[-3].equals(table[1]) assert table[-4].equals(table[0]) with pytest.raises(IndexError): table[-5] with pytest.raises(IndexError): table[4] def test_table_cast_to_incompatible_schema(): data = [ pa.array(range(5)), pa.array([-10, -5, 0, 5, 10]), ] table = pa.Table.from_arrays(data, names=tuple('ab')) target_schema1 = pa.schema([ pa.field('A', pa.int32()), pa.field('b', pa.int16()), ]) target_schema2 = pa.schema([ pa.field('a', pa.int32()), ]) message = ("Target schema's field names are not matching the table's " "field names:.*") with pytest.raises(ValueError, match=message): table.cast(target_schema1) with pytest.raises(ValueError, match=message): table.cast(target_schema2) def test_table_safe_casting(): data = [ pa.array(range(5), type=pa.int64()), pa.array([-10, -5, 0, 5, 10], type=pa.int32()), pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64()), pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string()) ] table = pa.Table.from_arrays(data, names=tuple('abcd')) expected_data = [ pa.array(range(5), type=pa.int32()), pa.array([-10, -5, 0, 5, 10], type=pa.int16()), pa.array([1, 2, 3, 4, 5], type=pa.int64()), pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string()) ] expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd')) target_schema = pa.schema([ pa.field('a', pa.int32()), pa.field('b', pa.int16()), pa.field('c', pa.int64()), pa.field('d', pa.string()) ]) casted_table = table.cast(target_schema) assert casted_table.equals(expected_table) def test_table_unsafe_casting(): data = [ pa.array(range(5), type=pa.int64()), pa.array([-10, -5, 0, 5, 10], type=pa.int32()), pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()), pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string()) ] table = pa.Table.from_arrays(data, names=tuple('abcd')) expected_data = [ pa.array(range(5), type=pa.int32()), pa.array([-10, -5, 0, 5, 10], type=pa.int16()), pa.array([1, 2, 3, 4, 5], type=pa.int64()), pa.array(['ab', 'bc', 'cd', 'de', 'ef'], type=pa.string()) ] expected_table = pa.Table.from_arrays(expected_data, names=tuple('abcd')) target_schema = pa.schema([ pa.field('a', pa.int32()), pa.field('b', pa.int16()), pa.field('c', pa.int64()), pa.field('d', pa.string()) ]) with pytest.raises(pa.ArrowInvalid, match='truncated'): table.cast(target_schema) casted_table = table.cast(target_schema, safe=False) assert casted_table.equals(expected_table) def test_invalid_table_construct(): array = np.array([0, 1], dtype=np.uint8) u8 = pa.uint8() arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)] with pytest.raises(pa.lib.ArrowInvalid): pa.Table.from_arrays(arrays, names=["a1", "a2"]) @pytest.mark.parametrize('data, klass', [ ((['', 'foo', 'bar'], [4.5, 5, None]), list), ((['', 'foo', 'bar'], [4.5, 5, None]), pa.array), (([[''], ['foo', 'bar']], [[4.5], [5., None]]), pa.chunked_array), ]) def test_from_arrays_schema(data, klass): data = [klass(data[0]), klass(data[1])] schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())]) table = pa.Table.from_arrays(data, schema=schema) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema # length of data and schema not matching schema = pa.schema([('strs', pa.utf8())]) with pytest.raises(ValueError): pa.Table.from_arrays(data, schema=schema) # with different but compatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())]) table = pa.Table.from_arrays(data, schema=schema) assert pa.types.is_float32(table.column('floats').type) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): pa.Table.from_pydict(data, schema=schema) # Cannot pass both schema and metadata / names with pytest.raises(ValueError): pa.Table.from_arrays(data, schema=schema, names=['strs', 'floats']) with pytest.raises(ValueError): pa.Table.from_arrays(data, schema=schema, metadata={b'foo': b'bar'}) @pytest.mark.parametrize( ('cls'), [ (pa.Table), (pa.RecordBatch) ] ) def test_table_from_pydict(cls): table = cls.from_pydict({}) assert table.num_columns == 0 assert table.num_rows == 0 assert table.schema == pa.schema([]) assert table.to_pydict() == {} schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())]) # With lists as values data = OrderedDict([('strs', ['', 'foo', 'bar']), ('floats', [4.5, 5, None])]) table = cls.from_pydict(data) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema assert table.to_pydict() == data # With metadata and inferred schema metadata = {b'foo': b'bar'} schema = schema.with_metadata(metadata) table = cls.from_pydict(data, metadata=metadata) assert table.schema == schema assert table.schema.metadata == metadata assert table.to_pydict() == data # With explicit schema table = cls.from_pydict(data, schema=schema) assert table.schema == schema assert table.schema.metadata == metadata assert table.to_pydict() == data # Cannot pass both schema and metadata with pytest.raises(ValueError): cls.from_pydict(data, schema=schema, metadata=metadata) # Non-convertible values given schema with pytest.raises(TypeError): cls.from_pydict({'c0': [0, 1, 2]}, schema=pa.schema([("c0", pa.string())])) # Missing schema fields from the passed mapping with pytest.raises(KeyError, match="doesn\'t contain.* c, d"): cls.from_pydict( {'a': [1, 2, 3], 'b': [3, 4, 5]}, schema=pa.schema([ ('a', pa.int64()), ('c', pa.int32()), ('d', pa.int16()) ]) ) # Passed wrong schema type with pytest.raises(TypeError): cls.from_pydict({'a': [1, 2, 3]}, schema={}) @pytest.mark.parametrize('data, klass', [ ((['', 'foo', 'bar'], [4.5, 5, None]), pa.array), (([[''], ['foo', 'bar']], [[4.5], [5., None]]), pa.chunked_array), ]) def test_table_from_pydict_arrow_arrays(data, klass): data = OrderedDict([('strs', klass(data[0])), ('floats', klass(data[1]))]) schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())]) # With arrays as values table = pa.Table.from_pydict(data) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema # With explicit (matching) schema table = pa.Table.from_pydict(data, schema=schema) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema # with different but compatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())]) table = pa.Table.from_pydict(data, schema=schema) assert pa.types.is_float32(table.column('floats').type) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): pa.Table.from_pydict(data, schema=schema) @pytest.mark.parametrize('data, klass', [ ((['', 'foo', 'bar'], [4.5, 5, None]), list), ((['', 'foo', 'bar'], [4.5, 5, None]), pa.array), (([[''], ['foo', 'bar']], [[4.5], [5., None]]), pa.chunked_array), ]) def test_table_from_pydict_schema(data, klass): # passed schema is source of truth for the columns data = OrderedDict([('strs', klass(data[0])), ('floats', klass(data[1]))]) # schema has columns not present in data -> error schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()), ('ints', pa.int64())]) with pytest.raises(KeyError, match='ints'): pa.Table.from_pydict(data, schema=schema) # data has columns not present in schema -> ignored schema = pa.schema([('strs', pa.utf8())]) table = pa.Table.from_pydict(data, schema=schema) assert table.num_columns == 1 assert table.schema == schema assert table.column_names == ['strs'] @pytest.mark.parametrize( ('cls'), [ (pa.Table), (pa.RecordBatch) ] ) def test_table_from_pylist(cls): table = cls.from_pylist([]) assert table.num_columns == 0 assert table.num_rows == 0 assert table.schema == pa.schema([]) assert table.to_pylist() == [] schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64())]) # With lists as values data = [{'strs': '', 'floats': 4.5}, {'strs': 'foo', 'floats': 5}, {'strs': 'bar', 'floats': None}] table = cls.from_pylist(data) assert table.num_columns == 2 assert table.num_rows == 3 assert table.schema == schema assert table.to_pylist() == data # With metadata and inferred schema metadata = {b'foo': b'bar'} schema = schema.with_metadata(metadata) table = cls.from_pylist(data, metadata=metadata) assert table.schema == schema assert table.schema.metadata == metadata assert table.to_pylist() == data # With explicit schema table = cls.from_pylist(data, schema=schema) assert table.schema == schema assert table.schema.metadata == metadata assert table.to_pylist() == data # Cannot pass both schema and metadata with pytest.raises(ValueError): cls.from_pylist(data, schema=schema, metadata=metadata) # Non-convertible values given schema with pytest.raises(TypeError): cls.from_pylist([{'c0': 0}, {'c0': 1}, {'c0': 2}], schema=pa.schema([("c0", pa.string())])) # Missing schema fields in the passed mapping translate to None schema = pa.schema([('a', pa.int64()), ('c', pa.int32()), ('d', pa.int16()) ]) table = cls.from_pylist( [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}, {'a': 3, 'b': 5}], schema=schema ) data = [{'a': 1, 'c': None, 'd': None}, {'a': 2, 'c': None, 'd': None}, {'a': 3, 'c': None, 'd': None}] assert table.schema == schema assert table.to_pylist() == data # Passed wrong schema type with pytest.raises(TypeError): cls.from_pylist([{'a': 1}, {'a': 2}, {'a': 3}], schema={}) # If the dictionaries of rows are not same length data = [{'strs': '', 'floats': 4.5}, {'floats': 5}, {'strs': 'bar'}] data2 = [{'strs': '', 'floats': 4.5}, {'strs': None, 'floats': 5}, {'strs': 'bar', 'floats': None}] table = cls.from_pylist(data) assert table.num_columns == 2 assert table.num_rows == 3 assert table.to_pylist() == data2 data = [{'strs': ''}, {'strs': 'foo', 'floats': 5}, {'floats': None}] data2 = [{'strs': ''}, {'strs': 'foo'}, {'strs': None}] table = cls.from_pylist(data) assert table.num_columns == 1 assert table.num_rows == 3 assert table.to_pylist() == data2 @pytest.mark.pandas def test_table_from_pandas_schema(): # passed schema is source of truth for the columns import pandas as pd df = pd.DataFrame(OrderedDict([('strs', ['', 'foo', 'bar']), ('floats', [4.5, 5, None])])) # with different but compatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float32())]) table = pa.Table.from_pandas(df, schema=schema) assert pa.types.is_float32(table.column('floats').type) assert table.schema.remove_metadata() == schema # with different and incompatible schema schema = pa.schema([('strs', pa.utf8()), ('floats', pa.timestamp('s'))]) with pytest.raises((NotImplementedError, TypeError)): pa.Table.from_pandas(df, schema=schema) # schema has columns not present in data -> error schema = pa.schema([('strs', pa.utf8()), ('floats', pa.float64()), ('ints', pa.int64())]) with pytest.raises(KeyError, match='ints'): pa.Table.from_pandas(df, schema=schema) # data has columns not present in schema -> ignored schema = pa.schema([('strs', pa.utf8())]) table = pa.Table.from_pandas(df, schema=schema) assert table.num_columns == 1 assert table.schema.remove_metadata() == schema assert table.column_names == ['strs'] @pytest.mark.pandas def test_table_factory_function(): import pandas as pd # Put in wrong order to make sure that lines up with schema d = OrderedDict([('b', ['a', 'b', 'c']), ('a', [1, 2, 3])]) d_explicit = {'b': pa.array(['a', 'b', 'c'], type='string'), 'a': pa.array([1, 2, 3], type='int32')} schema = pa.schema([('a', pa.int32()), ('b', pa.string())]) df = pd.DataFrame(d) table1 = pa.table(df) table2 = pa.Table.from_pandas(df) assert table1.equals(table2) table1 = pa.table(df, schema=schema) table2 = pa.Table.from_pandas(df, schema=schema) assert table1.equals(table2) table1 = pa.table(d_explicit) table2 = pa.Table.from_pydict(d_explicit) assert table1.equals(table2) # schema coerces type table1 = pa.table(d, schema=schema) table2 = pa.Table.from_pydict(d, schema=schema) assert table1.equals(table2) def test_table_factory_function_args(): # from_pydict not accepting names: with pytest.raises(ValueError): pa.table({'a': [1, 2, 3]}, names=['a']) # backwards compatibility for schema as first positional argument schema = pa.schema([('a', pa.int32())]) table = pa.table({'a': pa.array([1, 2, 3], type=pa.int64())}, schema) assert table.column('a').type == pa.int32() # from_arrays: accept both names and schema as positional first argument data = [pa.array([1, 2, 3], type='int64')] names = ['a'] table = pa.table(data, names) assert table.column_names == names schema = pa.schema([('a', pa.int64())]) table = pa.table(data, schema) assert table.column_names == names @pytest.mark.pandas def test_table_factory_function_args_pandas(): import pandas as pd # from_pandas not accepting names or metadata: with pytest.raises(ValueError): pa.table(pd.DataFrame({'a': [1, 2, 3]}), names=['a']) with pytest.raises(ValueError): pa.table(pd.DataFrame({'a': [1, 2, 3]}), metadata={b'foo': b'bar'}) # backwards compatibility for schema as first positional argument schema = pa.schema([('a', pa.int32())]) table = pa.table(pd.DataFrame({'a': [1, 2, 3]}), schema) assert table.column('a').type == pa.int32() def test_factory_functions_invalid_input(): with pytest.raises(TypeError, match="Expected pandas DataFrame, python"): pa.table("invalid input") with pytest.raises(TypeError, match="Expected pandas DataFrame"): pa.record_batch("invalid input") def test_table_repr_to_string(): # Schema passed explicitly schema = pa.schema([pa.field('c0', pa.int16(), metadata={'key': 'value'}), pa.field('c1', pa.int32())], metadata={b'foo': b'bar'}) tab = pa.table([pa.array([1, 2, 3, 4], type='int16'), pa.array([10, 20, 30, 40], type='int32')], schema=schema) assert str(tab) == """pyarrow.Table c0: int16 c1: int32 ---- c0: [[1,2,3,4]] c1: [[10,20,30,40]]""" assert tab.to_string(show_metadata=True) == """\ pyarrow.Table c0: int16 -- field metadata -- key: 'value' c1: int32 -- schema metadata -- foo: 'bar'""" assert tab.to_string(preview_cols=5) == """\ pyarrow.Table c0: int16 c1: int32 ---- c0: [[1,2,3,4]] c1: [[10,20,30,40]]""" assert tab.to_string(preview_cols=1) == """\ pyarrow.Table c0: int16 c1: int32 ---- c0: [[1,2,3,4]] ...""" def test_table_repr_to_string_ellipsis(): # Schema passed explicitly schema = pa.schema([pa.field('c0', pa.int16(), metadata={'key': 'value'}), pa.field('c1', pa.int32())], metadata={b'foo': b'bar'}) tab = pa.table([pa.array([1, 2, 3, 4]*10, type='int16'), pa.array([10, 20, 30, 40]*10, type='int32')], schema=schema) assert str(tab) == """pyarrow.Table c0: int16 c1: int32 ---- c0: [[1,2,3,4,1,...,4,1,2,3,4]] c1: [[10,20,30,40,10,...,40,10,20,30,40]]""" def test_table_function_unicode_schema(): col_a = "äääh" col_b = "öööf" # Put in wrong order to make sure that lines up with schema d = OrderedDict([(col_b, ['a', 'b', 'c']), (col_a, [1, 2, 3])]) schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())]) result = pa.table(d, schema=schema) assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32')) assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) def test_table_take_vanilla_functionality(): table = pa.table( [pa.array([1, 2, 3, None, 5]), pa.array(['a', 'b', 'c', 'd', 'e'])], ['f1', 'f2']) assert table.take(pa.array([2, 3])).equals(table.slice(2, 2)) def test_table_take_null_index(): table = pa.table( [pa.array([1, 2, 3, None, 5]), pa.array(['a', 'b', 'c', 'd', 'e'])], ['f1', 'f2']) result_with_null_index = pa.table( [pa.array([1, None]), pa.array(['a', None])], ['f1', 'f2']) assert table.take(pa.array([0, None])).equals(result_with_null_index) def test_table_take_non_consecutive(): table = pa.table( [pa.array([1, 2, 3, None, 5]), pa.array(['a', 'b', 'c', 'd', 'e'])], ['f1', 'f2']) result_non_consecutive = pa.table( [pa.array([2, None]), pa.array(['b', 'd'])], ['f1', 'f2']) assert table.take(pa.array([1, 3])).equals(result_non_consecutive) def test_table_select(): a1 = pa.array([1, 2, 3, None, 5]) a2 = pa.array(['a', 'b', 'c', 'd', 'e']) a3 = pa.array([[1, 2], [3, 4], [5, 6], None, [9, 10]]) table = pa.table([a1, a2, a3], ['f1', 'f2', 'f3']) # selecting with string names result = table.select(['f1']) expected = pa.table([a1], ['f1']) assert result.equals(expected) result = table.select(['f3', 'f2']) expected = pa.table([a3, a2], ['f3', 'f2']) assert result.equals(expected) # selecting with integer indices result = table.select([0]) expected = pa.table([a1], ['f1']) assert result.equals(expected) result = table.select([2, 1]) expected = pa.table([a3, a2], ['f3', 'f2']) assert result.equals(expected) # preserve metadata table2 = table.replace_schema_metadata({"a": "test"}) result = table2.select(["f1", "f2"]) assert b"a" in result.schema.metadata # selecting non-existing column raises with pytest.raises(KeyError, match='Field "f5" does not exist'): table.select(['f5']) with pytest.raises(IndexError, match="index out of bounds"): table.select([5]) # duplicate selection gives duplicated names in resulting table result = table.select(['f2', 'f2']) expected = pa.table([a2, a2], ['f2', 'f2']) assert result.equals(expected) # selection duplicated column raises table = pa.table([a1, a2, a3], ['f1', 'f2', 'f1']) with pytest.raises(KeyError, match='Field "f1" exists 2 times'): table.select(['f1']) result = table.select(['f2']) expected = pa.table([a2], ['f2']) assert result.equals(expected) def test_table_group_by(): def sorted_by_keys(d): # Ensure a guaranteed order of keys for aggregation results. if "keys2" in d: keys = tuple(zip(d["keys"], d["keys2"])) else: keys = d["keys"] sorted_keys = sorted(keys) sorted_d = {"keys": sorted(d["keys"])} for entry in d: if entry == "keys": continue values = dict(zip(keys, d[entry])) for k in sorted_keys: sorted_d.setdefault(entry, []).append(values[k]) return sorted_d table = pa.table([ pa.array(["a", "a", "b", "b", "c"]), pa.array(["X", "X", "Y", "Z", "Z"]), pa.array([1, 2, 3, 4, 5]), pa.array([10, 20, 30, 40, 50]) ], names=["keys", "keys2", "values", "bigvalues"]) r = table.group_by("keys").aggregate([ ("values", "hash_sum") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "c"], "values_sum": [3, 7, 5] } r = table.group_by("keys").aggregate([ ("values", "hash_sum"), ("values", "hash_count") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "c"], "values_sum": [3, 7, 5], "values_count": [2, 2, 1] } # Test without hash_ prefix r = table.group_by("keys").aggregate([ ("values", "sum") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "c"], "values_sum": [3, 7, 5] } r = table.group_by("keys").aggregate([ ("values", "max"), ("bigvalues", "sum") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "c"], "values_max": [2, 4, 5], "bigvalues_sum": [30, 70, 50] } r = table.group_by("keys").aggregate([ ("bigvalues", "max"), ("values", "sum") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "c"], "values_sum": [3, 7, 5], "bigvalues_max": [20, 40, 50] } r = table.group_by(["keys", "keys2"]).aggregate([ ("values", "sum") ]) assert sorted_by_keys(r.to_pydict()) == { "keys": ["a", "b", "b", "c"], "keys2": ["X", "Y", "Z", "Z"], "values_sum": [3, 3, 4, 5] } table_with_nulls = pa.table([ pa.array(["a", "a", "a"]), pa.array([1, None, None]) ], names=["keys", "values"]) r = table_with_nulls.group_by(["keys"]).aggregate([ ("values", "count", pc.CountOptions(mode="all")) ]) assert r.to_pydict() == { "keys": ["a"], "values_count": [3] } r = table_with_nulls.group_by(["keys"]).aggregate([ ("values", "count", pc.CountOptions(mode="only_null")) ]) assert r.to_pydict() == { "keys": ["a"], "values_count": [2] } r = table_with_nulls.group_by(["keys"]).aggregate([ ("values", "count", pc.CountOptions(mode="only_valid")) ]) assert r.to_pydict() == { "keys": ["a"], "values_count": [1] } def test_table_sort_by(): table = pa.table([ pa.array([3, 1, 4, 2, 5]), pa.array(["b", "a", "b", "a", "c"]), ], names=["values", "keys"]) assert table.sort_by("values").to_pydict() == { "keys": ["a", "a", "b", "b", "c"], "values": [1, 2, 3, 4, 5] } assert table.sort_by([("values", "descending")]).to_pydict() == { "keys": ["c", "b", "b", "a", "a"], "values": [5, 4, 3, 2, 1] } def test_table_to_recordbatchreader(): table = pa.Table.from_pydict({'x': [1, 2, 3]}) reader = table.to_reader() assert table.schema == reader.schema assert table == reader.read_all() reader = table.to_reader(max_chunksize=2) assert reader.read_next_batch().num_rows == 2 assert reader.read_next_batch().num_rows == 1 @pytest.mark.dataset def test_table_join(): t1 = pa.table({ "colA": [1, 2, 6], "col2": ["a", "b", "f"] }) t2 = pa.table({ "colB": [99, 2, 1], "col3": ["Z", "B", "A"] }) result = t1.join(t2, "colA", "colB") assert result.combine_chunks() == pa.table({ "colA": [1, 2, 6], "col2": ["a", "b", "f"], "col3": ["A", "B", None] }) result = t1.join(t2, "colA", "colB", join_type="full outer") assert result.combine_chunks().sort_by("colA") == pa.table({ "colA": [1, 2, 6, 99], "col2": ["a", "b", "f", None], "col3": ["A", "B", None, "Z"] }) @pytest.mark.dataset def test_table_join_unique_key(): t1 = pa.table({ "colA": [1, 2, 6], "col2": ["a", "b", "f"] }) t2 = pa.table({ "colA": [99, 2, 1], "col3": ["Z", "B", "A"] }) result = t1.join(t2, "colA") assert result.combine_chunks() == pa.table({ "colA": [1, 2, 6], "col2": ["a", "b", "f"], "col3": ["A", "B", None] }) result = t1.join(t2, "colA", join_type="full outer", right_suffix="_r") assert result.combine_chunks().sort_by("colA") == pa.table({ "colA": [1, 2, 6, 99], "col2": ["a", "b", "f", None], "col3": ["A", "B", None, "Z"] }) @pytest.mark.dataset def test_table_join_collisions(): t1 = pa.table({ "colA": [1, 2, 6], "colB": [10, 20, 60], "colVals": ["a", "b", "f"] }) t2 = pa.table({ "colA": [99, 2, 1], "colB": [99, 20, 10], "colVals": ["Z", "B", "A"] }) result = t1.join(t2, "colA", join_type="full outer") assert result.combine_chunks().sort_by("colA") == pa.table([ [1, 2, 6, 99], [10, 20, 60, None], ["a", "b", "f", None], [10, 20, None, 99], ["A", "B", None, "Z"], ], names=["colA", "colB", "colVals", "colB", "colVals"])