# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import bz2
from contextlib import contextmanager
from io import (BytesIO, StringIO, TextIOWrapper, BufferedIOBase, IOBase)
import itertools
import gc
import gzip
import os
import pathlib
import pickle
import pytest
import sys
import tempfile
import weakref

import numpy as np

from pyarrow.util import guid
from pyarrow import Codec
import pyarrow as pa


def check_large_seeks(file_factory):
    if sys.platform in ('win32', 'darwin'):
        pytest.skip("need sparse file support")
    try:
        filename = tempfile.mktemp(prefix='test_io')
        with open(filename, 'wb') as f:
            f.truncate(2 ** 32 + 10)
            f.seek(2 ** 32 + 5)
            f.write(b'mark\n')
        with file_factory(filename) as f:
            assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
            assert f.tell() == 2 ** 32 + 5
            assert f.read(5) == b'mark\n'
            assert f.tell() == 2 ** 32 + 10
    finally:
        os.unlink(filename)


@contextmanager
def assert_file_not_found():
    with pytest.raises(FileNotFoundError):
        yield


# ----------------------------------------------------------------------
# Python file-like objects


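# A minimal usage sketch, not a test (an illustration assuming only pyarrow
# and the standard library): pa.PythonFile adapts a binary Python file
# object so Arrow's C++ I/O layer can read from or write to it. Note that
# closing the wrapper also closes the wrapped object (see
# test_python_file_closing below), so the value is grabbed first.
def _python_file_sketch():
    sink = BytesIO()
    f = pa.PythonFile(sink)    # mode inferred from the wrapped object
    f.write(b'payload')
    payload = sink.getvalue()  # grab the bytes before close()
    f.close()                  # also closes the underlying BytesIO
    return payload             # b'payload'

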
def test_python_file_write():
    buf = BytesIO()

    f = pa.PythonFile(buf)

    assert f.tell() == 0

    s1 = b'enga\xc3\xb1ado'
    s2 = b'foobar'

    f.write(s1)
    assert f.tell() == len(s1)

    f.write(s2)

    expected = s1 + s2

    result = buf.getvalue()
    assert result == expected

    assert not f.closed
    f.close()
    assert f.closed

    with pytest.raises(TypeError, match="binary file expected"):
        pa.PythonFile(StringIO())


def test_python_file_read():
    data = b'some sample data'

    buf = BytesIO(data)
    f = pa.PythonFile(buf, mode='r')

    assert f.size() == len(data)

    assert f.tell() == 0

    assert f.read(4) == b'some'
    assert f.tell() == 4

    f.seek(0)
    assert f.tell() == 0

    f.seek(5)
    assert f.tell() == 5

    v = f.read(50)
    assert v == b'sample data'
    assert len(v) == 11

    assert f.size() == len(data)

    assert not f.closed
    f.close()
    assert f.closed

    with pytest.raises(TypeError, match="binary file expected"):
        pa.PythonFile(StringIO(), mode='r')


def test_python_file_read_at():
    data = b'some sample data'

    buf = BytesIO(data)
    f = pa.PythonFile(buf, mode='r')

    # test simple read at
    v = f.read_at(nbytes=5, offset=3)
    assert v == b'e sam'
    assert len(v) == 5

    # test reading entire file when nbytes > len(file)
    w = f.read_at(nbytes=50, offset=0)
    assert w == data
    assert len(w) == 16


def test_python_file_readall():
    data = b'some sample data'

    buf = BytesIO(data)
    with pa.PythonFile(buf, mode='r') as f:
        assert f.readall() == data


def test_python_file_readinto():
    length = 10
    data = b'some sample data longer than 10'
    dst_buf = bytearray(length)
    src_buf = BytesIO(data)

    with pa.PythonFile(src_buf, mode='r') as f:
        assert f.readinto(dst_buf) == 10

    assert dst_buf[:length] == data[:length]
    assert len(dst_buf) == length


def test_python_file_read_buffer():
    length = 10
    data = b'0123456798'
    dst_buf = bytearray(data)

    class DuckReader:
        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read_buffer(self, nbytes):
            assert nbytes == length
            return memoryview(dst_buf)[:nbytes]

    duck_reader = DuckReader()
    with pa.PythonFile(duck_reader, mode='r') as f:
        buf = f.read_buffer(length)
        assert len(buf) == length
        assert memoryview(buf).tobytes() == dst_buf[:length]
        # buf should point to the same memory, so modifying it...
        memoryview(buf)[0] = ord(b'x')
        # ...should modify the original
        assert dst_buf[0] == ord(b'x')


def test_python_file_correct_abc():
    with pa.PythonFile(BytesIO(b''), mode='r') as f:
        assert isinstance(f, BufferedIOBase)
        assert isinstance(f, IOBase)


def test_python_file_iterable():
    data = b'''line1
line2
line3
'''

    buf = BytesIO(data)
    buf2 = BytesIO(data)

    with pa.PythonFile(buf, mode='r') as f:
        for read, expected in zip(f, buf2):
            assert read == expected


def test_python_file_large_seeks():
    def factory(filename):
        return pa.PythonFile(open(filename, 'rb'))

    check_large_seeks(factory)


def test_bytes_reader():
    # Like a BytesIO, but zero-copy underneath for C++ consumers
    data = b'some sample data'
    f = pa.BufferReader(data)
    assert f.tell() == 0

    assert f.size() == len(data)

    assert f.read(4) == b'some'
    assert f.tell() == 4

    f.seek(0)
    assert f.tell() == 0

    f.seek(0, 2)
    assert f.tell() == len(data)

    f.seek(5)
    assert f.tell() == 5

    assert f.read(50) == b'sample data'

    assert not f.closed
    f.close()
    assert f.closed


def test_bytes_reader_non_bytes():
    with pytest.raises(TypeError):
        pa.BufferReader('some sample data')


def test_bytes_reader_retains_parent_reference():
    import gc

    # ARROW-421
    def get_buffer():
        data = b'some sample data' * 1000
        reader = pa.BufferReader(data)
        reader.seek(5)
        return reader.read_buffer(6)

    buf = get_buffer()
    gc.collect()
    assert buf.to_pybytes() == b'sample'
    assert buf.parent is not None


def test_python_file_implicit_mode(tmpdir):
    path = os.path.join(str(tmpdir), 'foo.txt')
    with open(path, 'wb') as f:
        pf = pa.PythonFile(f)
        assert pf.writable()
        assert not pf.readable()
        assert not pf.seekable()  # PyOutputStream isn't seekable
        f.write(b'foobar\n')

    with open(path, 'rb') as f:
        pf = pa.PythonFile(f)
        assert pf.readable()
        assert not pf.writable()
        assert pf.seekable()
        assert pf.read() == b'foobar\n'

    bio = BytesIO()
    pf = pa.PythonFile(bio)
    assert pf.writable()
    assert not pf.readable()
    assert not pf.seekable()
    pf.write(b'foobar\n')
    assert bio.getvalue() == b'foobar\n'


def test_python_file_writelines(tmpdir):
    lines = [b'line1\n', b'line2\n', b'line3']
    path = os.path.join(str(tmpdir), 'foo.txt')
    with open(path, 'wb') as f:
        try:
            f = pa.PythonFile(f, mode='w')
            assert f.writable()
            f.writelines(lines)
        finally:
            f.close()

    with open(path, 'rb') as f:
        try:
            f = pa.PythonFile(f, mode='r')
            assert f.readable()
            assert f.read() == b''.join(lines)
        finally:
            f.close()


def test_python_file_closing():
    bio = BytesIO()
    pf = pa.PythonFile(bio)
    wr = weakref.ref(pf)
    del pf
    assert wr() is None  # object was destroyed
    assert not bio.closed
    pf = pa.PythonFile(bio)
    pf.close()
    assert bio.closed


# ----------------------------------------------------------------------
# Buffers


def check_buffer_pickling(buf):
    # Check that buffer survives a pickle roundtrip
    for protocol in range(0, pickle.HIGHEST_PROTOCOL + 1):
        result = pickle.loads(pickle.dumps(buf, protocol=protocol))
        assert len(result) == len(buf)
        assert memoryview(result) == memoryview(buf)
        assert result.to_pybytes() == buf.to_pybytes()
        assert result.is_mutable == buf.is_mutable


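# Illustrative sketch, not a test: pa.py_buffer wraps any object exporting
# the Python buffer protocol without copying, so in-place mutation of the
# source stays visible through the Arrow buffer (as test_buffer_to_numpy
# below verifies from the NumPy side).
def _py_buffer_sketch():
    source = bytearray(b'abc')
    buf = pa.py_buffer(source)  # zero-copy view over `source`
    source[0] = ord(b'x')
    return buf.to_pybytes()     # b'xbc' -- reflects the mutation

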
def test_buffer_bytes():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable
    assert buf.is_cpu

    result = buf.to_pybytes()
    assert result == val

    check_buffer_pickling(buf)


def test_buffer_null_data():
    null_buff = pa.foreign_buffer(address=0, size=0)
    assert null_buff.to_pybytes() == b""
    assert null_buff.address == 0
    # ARROW-16048: we shouldn't expose a NULL address through the Python
    # buffer protocol.
    m = memoryview(null_buff)
    assert m.tobytes() == b""
    assert pa.py_buffer(m).address != 0

    check_buffer_pickling(null_buff)


def test_buffer_memoryview():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable
    assert buf.is_cpu

    result = memoryview(buf)
    assert result == val

    check_buffer_pickling(buf)


def test_buffer_bytearray():
    val = bytearray(b'some data')

    buf = pa.py_buffer(val)
    assert isinstance(buf, pa.Buffer)
    assert buf.is_mutable
    assert buf.is_cpu

    result = bytearray(buf)
    assert result == val

    check_buffer_pickling(buf)


def test_buffer_invalid():
    with pytest.raises(TypeError,
                       match="(bytes-like object|buffer interface)"):
        pa.py_buffer(None)


def test_buffer_weakref():
    buf = pa.py_buffer(b'some data')
    wr = weakref.ref(buf)
    assert wr() is not None
    del buf
    assert wr() is None


@pytest.mark.parametrize('val, expected_hex_buffer',
                         [(b'check', b'636865636B'),
                          (b'\a0', b'0730'),
                          (b'', b'')])
def test_buffer_hex(val, expected_hex_buffer):
    buf = pa.py_buffer(val)
    assert buf.hex() == expected_hex_buffer


def test_buffer_to_numpy():
    # Make sure creating a numpy array from an arrow buffer works
    byte_array = bytearray(20)
    byte_array[0] = 42
    buf = pa.py_buffer(byte_array)
    array = np.frombuffer(buf, dtype="uint8")
    assert array[0] == byte_array[0]
    byte_array[0] += 1
    assert array[0] == byte_array[0]
    assert array.base == buf


def test_buffer_from_numpy():
    # C-contiguous
    arr = np.arange(12, dtype=np.int8).reshape((3, 4))
    buf = pa.py_buffer(arr)
    assert buf.is_cpu
    assert buf.is_mutable
    assert buf.to_pybytes() == arr.tobytes()
    # F-contiguous; note strides information is lost
    buf = pa.py_buffer(arr.T)
    assert buf.is_cpu
    assert buf.is_mutable
    assert buf.to_pybytes() == arr.tobytes()
    # Non-contiguous
    with pytest.raises(ValueError, match="not contiguous"):
        buf = pa.py_buffer(arr.T[::2])


def test_buffer_address():
    b1 = b'some data!'
    b2 = bytearray(b1)
    b3 = bytearray(b1)

    buf1 = pa.py_buffer(b1)
    buf2 = pa.py_buffer(b1)
    buf3 = pa.py_buffer(b2)
    buf4 = pa.py_buffer(b3)

    assert buf1.address > 0
    assert buf1.address == buf2.address
    assert buf3.address != buf2.address
    assert buf4.address != buf3.address

    arr = np.arange(5)
    buf = pa.py_buffer(arr)
    assert buf.address == arr.ctypes.data


def test_buffer_equals():
    # Buffer.equals() returns true iff the buffers have the same contents
    def eq(a, b):
        assert a.equals(b)
        assert a == b
        assert not (a != b)

    def ne(a, b):
        assert not a.equals(b)
        assert not (a == b)
        assert a != b

    b1 = b'some data!'
    b2 = bytearray(b1)
    b3 = bytearray(b1)
    b3[0] = 42
    buf1 = pa.py_buffer(b1)
    buf2 = pa.py_buffer(b2)
    buf3 = pa.py_buffer(b2)
    buf4 = pa.py_buffer(b3)
    buf5 = pa.py_buffer(np.frombuffer(b2, dtype=np.int16))
    eq(buf1, buf1)
    eq(buf1, buf2)
    eq(buf2, buf3)
    ne(buf2, buf4)
    # The data type does not matter; only the raw bytes are compared
    eq(buf2, buf5)


def test_buffer_eq_bytes():
    buf = pa.py_buffer(b'some data')
    assert buf == b'some data'
    assert buf == bytearray(b'some data')
    assert buf != b'some dat1'

    with pytest.raises(TypeError):
        buf == 'some data'


def test_buffer_getitem():
    data = bytearray(b'some data!')
    buf = pa.py_buffer(data)

    n = len(data)
    for ix in range(-n, n - 1):
        assert buf[ix] == data[ix]

    with pytest.raises(IndexError):
        buf[n]

    with pytest.raises(IndexError):
        buf[-n - 1]


def test_buffer_slicing():
    data = b'some data!'
    buf = pa.py_buffer(data)

    sliced = buf.slice(2)
    expected = pa.py_buffer(b'me data!')
    assert sliced.equals(expected)

    sliced2 = buf.slice(2, 4)
    expected2 = pa.py_buffer(b'me d')
    assert sliced2.equals(expected2)

    # 0 offset
    assert buf.slice(0).equals(buf)

    # Slice past end of buffer
    assert len(buf.slice(len(buf))) == 0

    with pytest.raises(IndexError):
        buf.slice(-1)

    with pytest.raises(IndexError):
        buf.slice(len(buf) + 1)
    assert buf[11:].to_pybytes() == b""

    # Slice stop exceeds buffer length
    with pytest.raises(IndexError):
        buf.slice(1, len(buf))
    assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:]

    # Negative length
    with pytest.raises(IndexError):
        buf.slice(1, -1)

    # Test slice notation
    assert buf[2:].equals(buf.slice(2))
    assert buf[2:5].equals(buf.slice(2, 3))
    assert buf[-5:].equals(buf.slice(len(buf) - 5))
    assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3))

    with pytest.raises(IndexError):
        buf[::-1]
    with pytest.raises(IndexError):
        buf[::2]

    n = len(buf)
    for start in range(-n * 2, n * 2):
        for stop in range(-n * 2, n * 2):
            assert (buf[start:stop].to_pybytes()
                    == buf.to_pybytes()[start:stop])


def test_buffer_hashing():
    # Buffers are unhashable
    with pytest.raises(TypeError, match="unhashable"):
        hash(pa.py_buffer(b'123'))


def test_buffer_protocol_respects_immutability():
    # ARROW-3228: NumPy's frombuffer constructor determines whether a
    # buffer-like object is mutable by first attempting to get a mutable
    # buffer using PyObject_GetBuffer. If that fails, it assumes that the
    # object is immutable
    a = b'12345'
    arrow_ref = pa.py_buffer(a)
    numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8)
    assert not numpy_ref.flags.writeable


def test_foreign_buffer():
    obj = np.array([1, 2], dtype=np.int32)
    addr = obj.__array_interface__["data"][0]
    size = obj.nbytes
    buf = pa.foreign_buffer(addr, size, obj)
    wr = weakref.ref(obj)
    del obj
    assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2]
    assert wr() is not None
    del buf
    assert wr() is None


def test_allocate_buffer():
    buf = pa.allocate_buffer(100)
    assert buf.size == 100
    assert buf.is_mutable
    assert buf.parent is None

    bit = b'abcde'
    writer = pa.FixedSizeBufferWriter(buf)
    writer.write(bit)

    assert buf.to_pybytes()[:5] == bit


def test_allocate_buffer_resizable():
    buf = pa.allocate_buffer(100, resizable=True)
    assert isinstance(buf, pa.ResizableBuffer)

    buf.resize(200)
    assert buf.size == 200


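# Illustrative sketch, not a test (assumes the gzip codec was built into
# this Arrow installation): the one-shot compression helpers operate on
# whole buffers. Arrow's raw compressed format does not record the
# uncompressed length, so pa.decompress() must be told it explicitly.
def _one_shot_compression_sketch(payload=b'x' * 1000):
    compressed = pa.compress(payload, codec='gzip', asbytes=True)
    return pa.decompress(compressed, decompressed_size=len(payload),
                         codec='gzip', asbytes=True)  # == payload

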
@pytest.mark.parametrize("compression", [
    pytest.param(
        "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    ),
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    "snappy"
])
def test_compress_decompress(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    compressed_buf = pa.compress(test_buf, codec=compression)
    compressed_bytes = pa.compress(test_data, codec=compression,
                                   asbytes=True)

    assert isinstance(compressed_bytes, bytes)

    decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                     codec=compression)
    decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                       codec=compression, asbytes=True)

    assert isinstance(decompressed_bytes, bytes)

    assert decompressed_buf.equals(test_buf)
    assert decompressed_bytes == test_data

    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)


@pytest.mark.parametrize("compression", [
    pytest.param(
        "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    ),
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    "snappy"
])
def test_compression_level(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    # These codecs do not support a compression level
    no_level = ['snappy']
    if compression in no_level:
        assert not Codec.supports_compression_level(compression)
        with pytest.raises(ValueError):
            Codec(compression, 0)
        with pytest.raises(ValueError):
            Codec.minimum_compression_level(compression)
        with pytest.raises(ValueError):
            Codec.maximum_compression_level(compression)
        with pytest.raises(ValueError):
            Codec.default_compression_level(compression)
        return

    INPUT_SIZE = 10000
    test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
                 .astype(np.uint8)
                 .tobytes())
    test_buf = pa.py_buffer(test_data)

    min_level = Codec.minimum_compression_level(compression)
    max_level = Codec.maximum_compression_level(compression)
    default_level = Codec.default_compression_level(compression)

    assert min_level < max_level
    assert default_level >= min_level
    assert default_level <= max_level

    for compression_level in range(min_level, max_level + 1):
        codec = Codec(compression, compression_level)
        compressed_buf = codec.compress(test_buf)
        compressed_bytes = codec.compress(test_data, asbytes=True)
        assert isinstance(compressed_bytes, bytes)
        decompressed_buf = codec.decompress(compressed_buf, INPUT_SIZE)
        decompressed_bytes = codec.decompress(compressed_bytes, INPUT_SIZE,
                                              asbytes=True)

        assert isinstance(decompressed_bytes, bytes)

        assert decompressed_buf.equals(test_buf)
        assert decompressed_bytes == test_data

        with pytest.raises(ValueError):
            codec.decompress(compressed_bytes)

    # The ability to set a seed this way is not present on older versions
    # of numpy (currently in our Python 3.6 CI build). Some inputs might
    # just happen to compress to the same size at both levels, so seeded
    # random numbers are necessary to get reliable results.
    #
    # The goal of this part is to ensure the compression_level is being
    # passed down to the C++ layer, not to verify the compression
    # algorithms themselves.
    if not hasattr(np.random, 'default_rng'):
        pytest.skip('Requires newer version of numpy')
    rng = np.random.default_rng(seed=42)
    values = rng.integers(0, 100, 1000)
    arr = pa.array(values)
    hard_to_compress_buffer = arr.buffers()[1]

    weak_codec = Codec(compression, min_level)
    weakly_compressed_buf = weak_codec.compress(hard_to_compress_buffer)

    strong_codec = Codec(compression, max_level)
    strongly_compressed_buf = strong_codec.compress(hard_to_compress_buffer)

    assert len(weakly_compressed_buf) > len(strongly_compressed_buf)


def test_buffer_memoryview_is_immutable():
    val = b'some data'

    buf = pa.py_buffer(val)
    assert not buf.is_mutable
    assert isinstance(buf, pa.Buffer)

    result = memoryview(buf)
    assert result.readonly

    with pytest.raises(TypeError) as exc:
        result[0] = b'h'
        assert 'cannot modify read-only' in str(exc.value)

    b = bytes(buf)
    with pytest.raises(TypeError) as exc:
        b[0] = b'h'
        assert 'cannot modify read-only' in str(exc.value)


def test_uninitialized_buffer():
    # ARROW-2039: calling Buffer() directly creates an uninitialized object
    # ARROW-2638: prevent calling extension class constructors directly
    with pytest.raises(TypeError):
        pa.Buffer()


def test_memory_output_stream():
    # 10 bytes
    val = b'dataabcdef'
    f = pa.BufferOutputStream()

    K = 1000
    for i in range(K):
        f.write(val)

    buf = f.getvalue()
    assert len(buf) == len(val) * K
    assert buf.to_pybytes() == val * K


def test_inmemory_write_after_closed():
    f = pa.BufferOutputStream()
    f.write(b'ok')
    assert not f.closed
    f.getvalue()
    assert f.closed

    with pytest.raises(ValueError):
        f.write(b'not ok')


def test_buffer_protocol_ref_counting():
    def make_buffer(bytes_obj):
        return bytearray(pa.py_buffer(bytes_obj))

    buf = make_buffer(b'foo')
    gc.collect()
    assert buf == b'foo'

    # ARROW-1053
    val = b'foo'
    refcount_before = sys.getrefcount(val)
    for i in range(10):
        make_buffer(val)
    gc.collect()
    assert refcount_before == sys.getrefcount(val)


def test_nativefile_write_memoryview():
    f = pa.BufferOutputStream()
    data = b'ok'

    arr = np.frombuffer(data, dtype='S1')

    f.write(arr)
    f.write(bytearray(data))
    f.write(pa.py_buffer(data))
    with pytest.raises(TypeError):
        f.write(data.decode('utf8'))

    buf = f.getvalue()

    assert buf.to_pybytes() == data * 3


# ----------------------------------------------------------------------
# Mock output stream


def test_mock_output_stream():
    # Make sure that the MockOutputStream and the BufferOutputStream record
    # the same size

    # 10 bytes
    val = b'dataabcdef'

    f1 = pa.MockOutputStream()
    f2 = pa.BufferOutputStream()

    K = 1000
    for i in range(K):
        f1.write(val)
        f2.write(val)

    assert f1.size() == len(f2.getvalue())

    # Do the same test with a table
    record_batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['a'])

    f1 = pa.MockOutputStream()
    f2 = pa.BufferOutputStream()

    stream_writer1 = pa.RecordBatchStreamWriter(f1, record_batch.schema)
    stream_writer2 = pa.RecordBatchStreamWriter(f2, record_batch.schema)

    stream_writer1.write_batch(record_batch)
    stream_writer2.write_batch(record_batch)
    stream_writer1.close()
    stream_writer2.close()

    assert f1.size() == len(f2.getvalue())


# ----------------------------------------------------------------------
# OS files and memory maps


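# Illustrative sketch, not a test (the `path` argument is a placeholder):
# pa.create_memory_map makes a new file of a fixed size, while pa.memory_map
# opens an existing file as a zero-copy random-access file.
def _memory_map_sketch(path):
    with pa.create_memory_map(path, 4096) as mm:  # new 4 KiB file
        mm.write(b'header')
    with pa.memory_map(path, 'r') as mm:          # read-only view
        return mm.read_buffer(6)                  # zero-copy pa.Buffer

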
@pytest.fixture
def sample_disk_data(request, tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())

    with open(path, 'wb') as f:
        f.write(data)

    def teardown():
        _try_delete(path)

    request.addfinalizer(teardown)
    return path, data


def _check_native_file_reader(FACTORY, sample_data,
                              allow_read_out_of_bounds=True):
    path, data = sample_data

    f = FACTORY(path, mode='r')

    assert f.read(10) == data[:10]
    assert f.read(0) == b''
    assert f.tell() == 10

    assert f.read() == data[10:]

    assert f.size() == len(data)

    f.seek(0)
    assert f.tell() == 0

    # Seeking past end of file not supported in memory maps
    if allow_read_out_of_bounds:
        f.seek(len(data) + 1)
        assert f.tell() == len(data) + 1
        assert f.read(5) == b''

    # Test whence argument of seek, ARROW-1287
    assert f.seek(3) == 3
    assert f.seek(3, os.SEEK_CUR) == 6
    assert f.tell() == 6

    ex_length = len(data) - 2
    assert f.seek(-2, os.SEEK_END) == ex_length
    assert f.tell() == ex_length


def test_memory_map_reader(sample_disk_data):
    _check_native_file_reader(pa.memory_map, sample_disk_data,
                              allow_read_out_of_bounds=False)


def test_memory_map_retain_buffer_reference(sample_disk_data):
    path, data = sample_disk_data

    cases = []
    with pa.memory_map(path, 'rb') as f:
        cases.append((f.read_buffer(100), data[:100]))
        cases.append((f.read_buffer(100), data[100:200]))
        cases.append((f.read_buffer(100), data[200:300]))

    # Call gc.collect() for good measure
    gc.collect()

    for buf, expected in cases:
        assert buf.to_pybytes() == expected


def test_os_file_reader(sample_disk_data):
    _check_native_file_reader(pa.OSFile, sample_disk_data)


def test_os_file_large_seeks():
    check_large_seeks(pa.OSFile)


def _try_delete(path):
    try:
        os.remove(path)
    except os.error:
        pass


def test_memory_map_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    f = pa.memory_map(path, mode='r+b')

    f.seek(10)
    f.write(b'peekaboo')
    assert f.tell() == 18

    f.seek(10)
    assert f.read(8) == b'peekaboo'

    f2 = pa.memory_map(path, mode='r+b')

    f2.seek(10)
    f2.write(b'booapeak')
    f2.seek(10)

    f.seek(10)
    assert f.read(8) == b'booapeak'

    # Does not truncate file
    f3 = pa.memory_map(path, mode='w')
    f3.write(b'foo')

    with pa.memory_map(path) as f4:
        assert f4.size() == SIZE

    with pytest.raises(IOError):
        f3.read(5)

    f.seek(0)
    assert f.read(3) == b'foo'


def test_memory_map_resize(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
    data1 = arr.tobytes()[:(SIZE // 2)]
    data2 = arr.tobytes()[(SIZE // 2):]

    path = os.path.join(str(tmpdir), guid())

    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(data1)

    mmap.resize(SIZE)
    mmap.write(data2)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == arr.tobytes()


def test_memory_zero_length(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0


def test_memory_map_large_seeks():
    check_large_seeks(pa.memory_map)


def test_memory_map_close_remove(tmpdir):
    # ARROW-6740: should be able to delete closed memory-mapped file (Windows)
    path = os.path.join(str(tmpdir), guid())
    mmap = pa.create_memory_map(path, 4096)
    mmap.close()
    assert mmap.closed
    os.remove(path)  # Shouldn't fail


def test_memory_map_deref_remove(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    os.remove(path)  # Shouldn't fail


def test_os_file_writer(tmpdir):
    SIZE = 4096
    arr = np.random.randint(0, 256, size=SIZE).astype('u1')
    data = arr.tobytes()[:SIZE]

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Truncates file
    f2 = pa.OSFile(path, mode='w')
    f2.write(b'foo')

    with pa.OSFile(path) as f3:
        assert f3.size() == 3

    with pytest.raises(IOError):
        f2.read(5)


def test_native_file_write_reject_unicode():
    # ARROW-3227
    nf = pa.BufferOutputStream()
    with pytest.raises(TypeError):
        nf.write('foo')


def test_native_file_modes(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='rb') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.OSFile(path, mode='w') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with pa.OSFile(path, mode='wb') as f:
        assert f.mode == 'wb'
        assert not f.readable()
        assert f.writable()
        assert not f.seekable()

    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.memory_map(path, 'r') as f:
        assert f.mode == 'rb'
        assert f.readable()
        assert not f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()

    with pa.memory_map(path, 'r+b') as f:
        assert f.mode == 'rb+'
        assert f.readable()
        assert f.writable()
        assert f.seekable()


def test_native_file_permissions(tmpdir):
    # ARROW-10124: permissions of created files should follow umask
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask


def test_native_file_raises_ValueError_after_close(tmpdir):
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)


def test_native_file_TextIOWrapper(tmpdir):
    data = ('foooo\n'
            'barrr\n'
            'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iteration works
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data


def test_native_file_open_error():
    with assert_file_not_found():
        pa.OSFile('non_existent_file', 'rb')
    with assert_file_not_found():
        pa.memory_map('non_existent_file', 'rb')


# ----------------------------------------------------------------------
# Buffered streams


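# Illustrative sketch, not a test: the buffered wrappers batch small reads
# and writes into buffer_size-byte operations against the raw stream, and
# closing them also closes the raw stream unless detach() is called first.
def _buffered_stream_sketch():
    raw = pa.BufferOutputStream()
    buffered = pa.BufferedOutputStream(raw, buffer_size=4096)
    buffered.write(b'tiny')  # staged in the 4 KiB buffer
    buffered.flush()         # forces the bytes through to `raw`
    return buffered.detach()  # hand back `raw`, leaving it open

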
def test_buffered_input_stream():
    raw = pa.BufferReader(b"123456789")
    f = pa.BufferedInputStream(raw, buffer_size=4)
    assert f.read(2) == b"12"
    assert raw.tell() == 4
    f.close()
    assert f.closed
    assert raw.closed


def test_buffered_input_stream_detach_seekable():
    # detach() to a seekable file (io::RandomAccessFile in C++)
    f = pa.BufferedInputStream(pa.BufferReader(b"123456789"), buffer_size=4)
    assert f.read(2) == b"12"
    raw = f.detach()
    assert f.closed
    assert not raw.closed
    assert raw.seekable()
    assert raw.read(4) == b"5678"
    raw.seek(2)
    assert raw.read(4) == b"3456"


def test_buffered_input_stream_detach_non_seekable():
    # detach() to a non-seekable file (io::InputStream in C++)
    f = pa.BufferedInputStream(
        pa.BufferedInputStream(pa.BufferReader(b"123456789"), buffer_size=4),
        buffer_size=4)
    assert f.read(2) == b"12"
    raw = f.detach()
    assert f.closed
    assert not raw.closed
    assert not raw.seekable()
    assert raw.read(4) == b"5678"
    with pytest.raises(EnvironmentError):
        raw.seek(2)


def test_buffered_output_stream():
    np_buf = np.zeros(100, dtype=np.int8)  # zero-initialized buffer
    buf = pa.py_buffer(np_buf)

    raw = pa.FixedSizeBufferWriter(buf)
    f = pa.BufferedOutputStream(raw, buffer_size=4)
    f.write(b"12")
    assert np_buf[:4].tobytes() == b'\0\0\0\0'
    f.flush()
    assert np_buf[:4].tobytes() == b'12\0\0'
    f.write(b"3456789")
    f.close()
    assert f.closed
    assert raw.closed
    assert np_buf[:10].tobytes() == b'123456789\0'


def test_buffered_output_stream_detach():
    np_buf = np.zeros(100, dtype=np.int8)  # zero-initialized buffer
    buf = pa.py_buffer(np_buf)

    f = pa.BufferedOutputStream(pa.FixedSizeBufferWriter(buf), buffer_size=4)
    f.write(b"12")
    assert np_buf[:4].tobytes() == b'\0\0\0\0'
    raw = f.detach()
    assert f.closed
    assert not raw.closed
    assert np_buf[:4].tobytes() == b'12\0\0'


# ----------------------------------------------------------------------
# Compressed input and output streams


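# Illustrative sketch, not a test (assumes the gzip codec is built in):
# CompressedOutputStream compresses on the fly; closing it (or leaving the
# `with` block) is what flushes the codec's trailer, so the compressed
# payload is only complete after close().
def _compressed_stream_sketch():
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, 'gzip') as out:
        out.write(b'some repetitive payload ' * 100)
    compressed = raw.getvalue()
    reader = pa.CompressedInputStream(pa.BufferReader(compressed), 'gzip')
    with reader:
        return reader.read()  # round-trips the original payload

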
def check_compressed_input(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert compressed.readable()
        assert not compressed.writable()
        assert not compressed.seekable()
        got = compressed.read()
        assert got == data
    assert compressed.closed
    assert raw.closed

    # Same with read_buffer()
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        buf = compressed.read_buffer()
        assert isinstance(buf, pa.Buffer)
        assert buf.to_pybytes() == data


@pytest.mark.gzip
def test_compressed_input_gzip(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_input_test.gz")
    with gzip.open(fn, "wb") as f:
        f.write(data)
    check_compressed_input(data, fn, "gzip")


def test_compressed_input_bz2(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_input_test.bz2")
    with bz2.BZ2File(fn, "w") as f:
        f.write(data)
    try:
        check_compressed_input(data, fn, "bz2")
    except NotImplementedError as e:
        pytest.skip(str(e))


@pytest.mark.gzip
def test_compressed_input_openfile(tmpdir):
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")

    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "test_compressed_input_openfile.gz")
    with gzip.open(fn, "wb") as f:
        f.write(data)

    with pa.CompressedInputStream(fn, "gzip") as compressed:
        buf = compressed.read_buffer()
        assert buf.to_pybytes() == data
    assert compressed.closed

    with pa.CompressedInputStream(pathlib.Path(fn), "gzip") as compressed:
        buf = compressed.read_buffer()
        assert buf.to_pybytes() == data
    assert compressed.closed

    f = open(fn, "rb")
    with pa.CompressedInputStream(f, "gzip") as compressed:
        buf = compressed.read_buffer()
        assert buf.to_pybytes() == data
    assert f.closed


def check_compressed_concatenated(data, fn, compression):
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data


@pytest.mark.gzip
def test_compressed_concatenated_gzip(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_input_test2.gz")
    with gzip.open(fn, "wb") as f:
        f.write(data[:50])
    with gzip.open(fn, "ab") as f:
        f.write(data[50:])
    check_compressed_concatenated(data, fn, "gzip")


@pytest.mark.gzip
def test_compressed_input_invalid():
    data = b"foo" * 10
    raw = pa.BufferReader(data)
    with pytest.raises(ValueError):
        pa.CompressedInputStream(raw, "unknown_compression")
    with pytest.raises(TypeError):
        pa.CompressedInputStream(raw, None)

    with pa.CompressedInputStream(raw, "gzip") as compressed:
        with pytest.raises(IOError, match="zlib inflate failed"):
            compressed.read()


def make_compressed_output(data, fn, compression):
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert not compressed.readable()
        assert compressed.writable()
        assert not compressed.seekable()
        compressed.write(data)
    assert compressed.closed
    assert raw.closed
    with open(fn, "wb") as f:
        f.write(raw.getvalue())


@pytest.mark.gzip
def test_compressed_output_gzip(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_output_test.gz")
    make_compressed_output(data, fn, "gzip")
    with gzip.open(fn, "rb") as f:
        got = f.read()
        assert got == data


def test_compressed_output_bz2(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "compressed_output_test.bz2")
    try:
        make_compressed_output(data, fn, "bz2")
    except NotImplementedError as e:
        pytest.skip(str(e))
    with bz2.BZ2File(fn, "r") as f:
        got = f.read()
        assert got == data


def test_output_stream_constructor(tmpdir):
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")
    with pa.CompressedOutputStream(tmpdir / "ctor.gz", "gzip") as stream:
        stream.write(b"test")
    with (tmpdir / "ctor2.gz").open("wb") as f:
        with pa.CompressedOutputStream(f, "gzip") as stream:
            stream.write(b"test")


@pytest.mark.parametrize(("path", "expected_compression"), [
    ("file.bz2", "bz2"),
    ("file.lz4", "lz4"),
    (pathlib.Path("file.gz"), "gzip"),
    (pathlib.Path("path/to/file.zst"), "zstd"),
])
def test_compression_detection(path, expected_compression):
    if not Codec.is_available(expected_compression):
        with pytest.raises(pa.lib.ArrowNotImplementedError):
            Codec.detect(path)
    else:
        codec = Codec.detect(path)
        assert isinstance(codec, Codec)
        assert codec.name == expected_compression


def test_unknown_compression_raises():
    with pytest.raises(ValueError):
        Codec.is_available('unknown')
    with pytest.raises(TypeError):
        Codec(None)
    with pytest.raises(ValueError):
        Codec('unknown')


@pytest.mark.parametrize("compression", [
    "bz2",
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    pytest.param(
        "snappy",
        marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    )
])
def test_compressed_roundtrip(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)

    cdata = raw.getvalue()
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data


@pytest.mark.parametrize(
    "compression",
    ["bz2", "brotli", "gzip", "lz4", "zstd"]
)
def test_compressed_recordbatch_stream(compression):
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table


# ----------------------------------------------------------------------
# Transform input streams


unicode_transcoding_example = (
    "Dès Noël où un zéphyr haï me vêt de glaçons würmiens "
    "je dîne d’exquis rôtis de bœuf au kir à l’aÿ d’âge mûr & cætera !"
)


def check_transcoding(data, src_encoding, dest_encoding, chunk_sizes):
    chunk_sizes = iter(chunk_sizes)
    stream = pa.transcoding_input_stream(
        pa.BufferReader(data.encode(src_encoding)),
        src_encoding, dest_encoding)
    out = []
    while True:
        buf = stream.read(next(chunk_sizes))
        out.append(buf)
        if not buf:
            break
    out = b''.join(out)
    assert out.decode(dest_encoding) == data


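# Illustrative sketch, not a test: transcoding_input_stream re-encodes
# bytes on the fly while reading, here from UTF-16 to UTF-8.
def _transcoding_sketch():
    raw = pa.BufferReader(unicode_transcoding_example.encode('utf-16'))
    stream = pa.transcoding_input_stream(raw, 'utf-16', 'utf-8')
    return stream.read().decode('utf-8')  # round-trips the original text

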
@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-16'),
                          ('utf-16', 'utf-8'),
                          ('utf-8', 'utf-32-le'),
                          ('utf-8', 'utf-32-be'),
                          ])
def test_transcoding_input_stream(src_encoding, dest_encoding):
    # All at once
    check_transcoding(unicode_transcoding_example,
                      src_encoding, dest_encoding, [1000, 0])
    # Incremental
    check_transcoding(unicode_transcoding_example,
                      src_encoding, dest_encoding,
                      itertools.cycle([1, 2, 3, 5]))


@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-8'),
                          ('utf-8', 'UTF8')])
def test_transcoding_no_ops(src_encoding, dest_encoding):
    # No indirection is wasted when a trivial transcoding is requested
    stream = pa.BufferReader(b"abc123")
    assert pa.transcoding_input_stream(
        stream, src_encoding, dest_encoding) is stream


@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'ascii'),
                          ('utf-8', 'latin-1'),
                          ])
def test_transcoding_encoding_error(src_encoding, dest_encoding):
    # Character \u0100 cannot be represented in the destination encoding
    stream = pa.transcoding_input_stream(
        pa.BufferReader("\u0100".encode(src_encoding)),
        src_encoding,
        dest_encoding)
    with pytest.raises(UnicodeEncodeError):
        stream.read(1)


@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-16'),
                          ('utf-16', 'utf-8'),
                          ])
def test_transcoding_decoding_error(src_encoding, dest_encoding):
    # The given bytestring is not valid in the source encoding
    stream = pa.transcoding_input_stream(
        pa.BufferReader(b"\xff\xff\xff\xff"),
        src_encoding,
        dest_encoding)
    with pytest.raises(UnicodeError):
        stream.read(1)


# ----------------------------------------------------------------------
# High-level API


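# Illustrative sketch, not a test (the `path` argument is a placeholder):
# pa.input_stream and pa.output_stream accept paths, buffers, or file
# objects, and layer compression (inferred from a .gz/.bz2/... suffix or
# passed explicitly) and buffering on top of the underlying stream.
def _high_level_sketch(path):
    with pa.output_stream(path, compression='gzip') as out:
        out.write(b'payload')
    with pa.input_stream(path, compression='gzip', buffer_size=4096) as src:
        return src.read()  # b'payload'

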
@pytest.mark.gzip
def test_input_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"
    for arg in [pa.py_buffer(data), memoryview(data)]:
        stream = pa.input_stream(arg)
        assert stream.read() == data

    gz_data = gzip.compress(data)
    stream = pa.input_stream(memoryview(gz_data))
    assert stream.read() == gz_data
    stream = pa.input_stream(memoryview(gz_data), compression='gzip')
    assert stream.read() == data


def test_input_stream_duck_typing():
    # Accept objects having the right file-like methods...
    class DuckReader:

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read(self, nbytes=None):
            return b'hello'

    stream = pa.input_stream(DuckReader())
    assert stream.read(5) == b'hello'


def test_input_stream_file_path(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    stream = pa.input_stream(file_path)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path))
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)))
    assert stream.read() == data


@pytest.mark.gzip
def test_input_stream_file_path_compressed(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip.compress(data)
    file_path = tmpdir / 'input_stream.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    stream = pa.input_stream(file_path)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path))
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)))
    assert stream.read() == data

    stream = pa.input_stream(file_path, compression='gzip')
    assert stream.read() == data
    stream = pa.input_stream(file_path, compression=None)
    assert stream.read() == gz_data


def test_input_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream.buffered'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    stream = pa.input_stream(file_path, buffer_size=32)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data

    unbuffered_stream = pa.input_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        pa.input_stream(file_path, buffer_size=-1)
    with pytest.raises(TypeError):
        pa.input_stream(file_path, buffer_size='million')


@pytest.mark.gzip
def test_input_stream_file_path_compressed_and_buffered(tmpdir):
    data = b"some test data\n" * 100 + b"eof\n"
    gz_data = gzip.compress(data)
    file_path = tmpdir / 'input_stream_compressed_and_buffered.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    stream = pa.input_stream(file_path, buffer_size=32, compression='gzip')
    assert stream.read() == data
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert stream.read() == data


@pytest.mark.gzip
def test_input_stream_python_file(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    bio = BytesIO(data)

    stream = pa.input_stream(bio)
    assert stream.read() == data

    gz_data = gzip.compress(data)
    bio = BytesIO(gz_data)
    stream = pa.input_stream(bio)
    assert stream.read() == gz_data
    bio.seek(0)
    stream = pa.input_stream(bio, compression='gzip')
    assert stream.read() == data

    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)
    with open(str(file_path), 'rb') as f:
        stream = pa.input_stream(f)
        assert stream.read() == data


@pytest.mark.gzip
def test_input_stream_native_file():
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip.compress(data)
    reader = pa.BufferReader(gz_data)
    stream = pa.input_stream(reader)
    assert stream is reader
    reader = pa.BufferReader(gz_data)
    stream = pa.input_stream(reader, compression='gzip')
    assert stream.read() == data


def test_input_stream_errors(tmpdir):
    buf = memoryview(b"")
    with pytest.raises(ValueError):
        pa.input_stream(buf, compression="foo")

    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.input_stream(arg)

    with assert_file_not_found():
        pa.input_stream("non_existent_file")

    with open(str(tmpdir / 'new_file'), 'wb') as f:
        with pytest.raises(TypeError, match="readable file expected"):
            pa.input_stream(f)


def test_output_stream_buffer():
    data = b"some test data\n" * 10 + b"eof\n"
    buf = bytearray(len(data))
    stream = pa.output_stream(pa.py_buffer(buf))
    stream.write(data)
    assert buf == data

    buf = bytearray(len(data))
    stream = pa.output_stream(memoryview(buf))
    stream.write(data)
    assert buf == data


def test_output_stream_duck_typing():
    # Accept objects having the right file-like methods...
    class DuckWriter:
        def __init__(self):
            self.buf = pa.BufferOutputStream()

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def write(self, data):
            self.buf.write(data)

    duck_writer = DuckWriter()
    stream = pa.output_stream(duck_writer)
    assert stream.write(b'hello')
    assert duck_writer.buf.getvalue().to_pybytes() == b'hello'


def test_output_stream_file_path(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream'

    def check_data(file_path, data):
        with pa.output_stream(file_path) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            assert f.read() == data

    check_data(file_path, data)
    check_data(str(file_path), data)
    check_data(pathlib.Path(str(file_path)), data)


@pytest.mark.gzip
def test_output_stream_file_path_compressed(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.gz'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    assert gzip.decompress(check_data(file_path, data)) == data
    assert gzip.decompress(check_data(str(file_path), data)) == data
    assert gzip.decompress(
        check_data(pathlib.Path(str(file_path)), data)) == data

    assert gzip.decompress(
        check_data(file_path, data, compression='gzip')) == data
    assert check_data(file_path, data, compression=None) == data

    with pytest.raises(ValueError, match='Invalid value for compression'):
        assert check_data(file_path, data, compression='rabbit') == data


def test_output_stream_file_path_buffered(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            if kwargs.get('buffer_size', 0) > 0:
                assert isinstance(stream, pa.BufferedOutputStream)
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data


@pytest.mark.gzip
def test_output_stream_file_path_compressed_and_buffered(tmpdir):
    data = b"some test data\n" * 100 + b"eof\n"
    file_path = tmpdir / 'output_stream_compressed_and_buffered.gz'

    def check_data(file_path, data, **kwargs):
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    result = check_data(file_path, data, buffer_size=32)
    assert gzip.decompress(result) == data

    result = check_data(file_path, data, buffer_size=1024)
    assert gzip.decompress(result) == data

    result = check_data(file_path, data, buffer_size=1024, compression='gzip')
    assert gzip.decompress(result) == data


def test_output_stream_destructor(tmpdir):
    # The wrapper returned by pa.output_stream() should respect Python
    # file semantics, i.e. destroying it should close the underlying
    # file cleanly.
    data = b"some test data\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        stream = pa.output_stream(file_path, **kwargs)
        stream.write(data)
        del stream
        gc.collect()
        with open(str(file_path), 'rb') as f:
            return f.read()

    assert check_data(file_path, data, buffer_size=0) == data
    assert check_data(file_path, data, buffer_size=1024) == data


@pytest.mark.gzip
def test_output_stream_python_file(tmpdir):
    data = b"some test data\n" * 10 + b"eof\n"

    def check_data(data, **kwargs):
        # XXX cannot use BytesIO because stream.close() is necessary
        # to finish writing compressed data, but it will also close the
        # underlying BytesIO
        fn = str(tmpdir / 'output_stream_file')
        with open(fn, 'wb') as f:
            with pa.output_stream(f, **kwargs) as stream:
                stream.write(data)
        with open(fn, 'rb') as f:
            return f.read()

    assert check_data(data) == data
    assert gzip.decompress(check_data(data, compression='gzip')) == data


def test_output_stream_errors(tmpdir):
    buf = memoryview(bytearray())
    with pytest.raises(ValueError):
        pa.output_stream(buf, compression="foo")

    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.output_stream(arg)

    fn = str(tmpdir / 'new_file')
    with open(fn, 'wb') as f:
        pass
    with open(fn, 'rb') as f:
        with pytest.raises(TypeError, match="writable file expected"):
            pa.output_stream(f)