
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import bz2
from contextlib import contextmanager
from io import (BytesIO, StringIO, TextIOWrapper, BufferedIOBase, IOBase)
import itertools
import gc
import gzip
import os
import pathlib
import pickle
import pytest
import sys
import tempfile
import weakref
import numpy as np
from pyarrow.util import guid
from pyarrow import Codec
import pyarrow as pa
def check_large_seeks(file_factory):
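    # Create a sparse file slightly larger than 4 GiB and verify that
    # seek offsets past the 32-bit boundary are honored exactly.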
if sys.platform in ('win32', 'darwin'):
pytest.skip("need sparse file support")
try:
filename = tempfile.mktemp(prefix='test_io')
with open(filename, 'wb') as f:
f.truncate(2 ** 32 + 10)
f.seek(2 ** 32 + 5)
f.write(b'mark\n')
with file_factory(filename) as f:
assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
assert f.tell() == 2 ** 32 + 5
assert f.read(5) == b'mark\n'
assert f.tell() == 2 ** 32 + 10
finally:
os.unlink(filename)
@contextmanager
def assert_file_not_found():
with pytest.raises(FileNotFoundError):
yield
# ----------------------------------------------------------------------
# Python file-like objects
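# pa.PythonFile wraps an arbitrary Python file object as an Arrow
# NativeFile; unless a mode is given explicitly, readability and
# writability are inferred from the wrapped object.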
def test_python_file_write():
buf = BytesIO()
f = pa.PythonFile(buf)
assert f.tell() == 0
s1 = b'enga\xc3\xb1ado'
s2 = b'foobar'
f.write(s1)
assert f.tell() == len(s1)
f.write(s2)
expected = s1 + s2
result = buf.getvalue()
assert result == expected
assert not f.closed
f.close()
assert f.closed
with pytest.raises(TypeError, match="binary file expected"):
pa.PythonFile(StringIO())
def test_python_file_read():
data = b'some sample data'
buf = BytesIO(data)
f = pa.PythonFile(buf, mode='r')
assert f.size() == len(data)
assert f.tell() == 0
assert f.read(4) == b'some'
assert f.tell() == 4
f.seek(0)
assert f.tell() == 0
f.seek(5)
assert f.tell() == 5
v = f.read(50)
assert v == b'sample data'
assert len(v) == 11
assert f.size() == len(data)
assert not f.closed
f.close()
assert f.closed
with pytest.raises(TypeError, match="binary file expected"):
pa.PythonFile(StringIO(), mode='r')
def test_python_file_read_at():
data = b'some sample data'
buf = BytesIO(data)
f = pa.PythonFile(buf, mode='r')
# test simple read at
v = f.read_at(nbytes=5, offset=3)
assert v == b'e sam'
assert len(v) == 5
# test reading entire file when nbytes > len(file)
w = f.read_at(nbytes=50, offset=0)
assert w == data
assert len(w) == 16
def test_python_file_readall():
data = b'some sample data'
buf = BytesIO(data)
with pa.PythonFile(buf, mode='r') as f:
assert f.readall() == data
def test_python_file_readinto():
length = 10
data = b'some sample data longer than 10'
dst_buf = bytearray(length)
src_buf = BytesIO(data)
with pa.PythonFile(src_buf, mode='r') as f:
assert f.readinto(dst_buf) == 10
assert dst_buf[:length] == data[:length]
assert len(dst_buf) == length
def test_python_file_read_buffer():
length = 10
data = b'0123456798'
dst_buf = bytearray(data)
class DuckReader:
def close(self):
pass
@property
def closed(self):
return False
def read_buffer(self, nbytes):
assert nbytes == length
return memoryview(dst_buf)[:nbytes]
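    # Any duck-typed object exposing close/closed/read_buffer is accepted;
    # the returned buffer should reference the reader's memory, not a copy.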
duck_reader = DuckReader()
with pa.PythonFile(duck_reader, mode='r') as f:
buf = f.read_buffer(length)
assert len(buf) == length
assert memoryview(buf).tobytes() == dst_buf[:length]
        # buf should point to the same memory, so modifying it
memoryview(buf)[0] = ord(b'x')
# should modify the original
assert dst_buf[0] == ord(b'x')
def test_python_file_correct_abc():
with pa.PythonFile(BytesIO(b''), mode='r') as f:
assert isinstance(f, BufferedIOBase)
assert isinstance(f, IOBase)
def test_python_file_iterable():
data = b'''line1
line2
line3
'''
buf = BytesIO(data)
buf2 = BytesIO(data)
with pa.PythonFile(buf, mode='r') as f:
for read, expected in zip(f, buf2):
assert read == expected
def test_python_file_large_seeks():
def factory(filename):
return pa.PythonFile(open(filename, 'rb'))
check_large_seeks(factory)
def test_bytes_reader():
# Like a BytesIO, but zero-copy underneath for C++ consumers
data = b'some sample data'
f = pa.BufferReader(data)
assert f.tell() == 0
assert f.size() == len(data)
assert f.read(4) == b'some'
assert f.tell() == 4
f.seek(0)
assert f.tell() == 0
f.seek(0, 2)
assert f.tell() == len(data)
f.seek(5)
assert f.tell() == 5
assert f.read(50) == b'sample data'
assert not f.closed
f.close()
assert f.closed
def test_bytes_reader_non_bytes():
with pytest.raises(TypeError):
pa.BufferReader('some sample data')
def test_bytes_reader_retains_parent_reference():
    # ARROW-421
def get_buffer():
data = b'some sample data' * 1000
reader = pa.BufferReader(data)
reader.seek(5)
return reader.read_buffer(6)
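    # The sliced buffer must keep its parent's memory alive even after
    # the BufferReader itself is garbage-collected.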
buf = get_buffer()
gc.collect()
assert buf.to_pybytes() == b'sample'
assert buf.parent is not None
def test_python_file_implicit_mode(tmpdir):
path = os.path.join(str(tmpdir), 'foo.txt')
with open(path, 'wb') as f:
pf = pa.PythonFile(f)
assert pf.writable()
assert not pf.readable()
assert not pf.seekable() # PyOutputStream isn't seekable
f.write(b'foobar\n')
with open(path, 'rb') as f:
pf = pa.PythonFile(f)
assert pf.readable()
assert not pf.writable()
assert pf.seekable()
assert pf.read() == b'foobar\n'
bio = BytesIO()
pf = pa.PythonFile(bio)
assert pf.writable()
assert not pf.readable()
assert not pf.seekable()
pf.write(b'foobar\n')
assert bio.getvalue() == b'foobar\n'
def test_python_file_writelines(tmpdir):
    lines = [b'line1\n', b'line2\n', b'line3']
path = os.path.join(str(tmpdir), 'foo.txt')
with open(path, 'wb') as f:
try:
f = pa.PythonFile(f, mode='w')
assert f.writable()
f.writelines(lines)
finally:
f.close()
with open(path, 'rb') as f:
try:
f = pa.PythonFile(f, mode='r')
assert f.readable()
assert f.read() == b''.join(lines)
finally:
f.close()
def test_python_file_closing():
bio = BytesIO()
pf = pa.PythonFile(bio)
wr = weakref.ref(pf)
del pf
assert wr() is None # object was destroyed
assert not bio.closed
pf = pa.PythonFile(bio)
pf.close()
assert bio.closed
# ----------------------------------------------------------------------
# Buffers
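# pa.py_buffer() wraps any object supporting the buffer protocol without
# copying; the resulting buffer's mutability follows the source object.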
def check_buffer_pickling(buf):
# Check that buffer survives a pickle roundtrip
for protocol in range(0, pickle.HIGHEST_PROTOCOL + 1):
result = pickle.loads(pickle.dumps(buf, protocol=protocol))
assert len(result) == len(buf)
assert memoryview(result) == memoryview(buf)
assert result.to_pybytes() == buf.to_pybytes()
assert result.is_mutable == buf.is_mutable
def test_buffer_bytes():
val = b'some data'
buf = pa.py_buffer(val)
assert isinstance(buf, pa.Buffer)
assert not buf.is_mutable
assert buf.is_cpu
result = buf.to_pybytes()
assert result == val
check_buffer_pickling(buf)
def test_buffer_null_data():
null_buff = pa.foreign_buffer(address=0, size=0)
assert null_buff.to_pybytes() == b""
assert null_buff.address == 0
# ARROW-16048: we shouldn't expose a NULL address through the Python
# buffer protocol.
m = memoryview(null_buff)
assert m.tobytes() == b""
assert pa.py_buffer(m).address != 0
check_buffer_pickling(null_buff)
def test_buffer_memoryview():
val = b'some data'
buf = pa.py_buffer(val)
assert isinstance(buf, pa.Buffer)
assert not buf.is_mutable
assert buf.is_cpu
result = memoryview(buf)
assert result == val
check_buffer_pickling(buf)
def test_buffer_bytearray():
val = bytearray(b'some data')
buf = pa.py_buffer(val)
assert isinstance(buf, pa.Buffer)
assert buf.is_mutable
assert buf.is_cpu
result = bytearray(buf)
assert result == val
check_buffer_pickling(buf)
def test_buffer_invalid():
with pytest.raises(TypeError,
match="(bytes-like object|buffer interface)"):
pa.py_buffer(None)
def test_buffer_weakref():
buf = pa.py_buffer(b'some data')
wr = weakref.ref(buf)
assert wr() is not None
del buf
assert wr() is None
@pytest.mark.parametrize('val, expected_hex_buffer',
[(b'check', b'636865636B'),
(b'\a0', b'0730'),
(b'', b'')])
def test_buffer_hex(val, expected_hex_buffer):
buf = pa.py_buffer(val)
assert buf.hex() == expected_hex_buffer
def test_buffer_to_numpy():
# Make sure creating a numpy array from an arrow buffer works
byte_array = bytearray(20)
byte_array[0] = 42
buf = pa.py_buffer(byte_array)
array = np.frombuffer(buf, dtype="uint8")
assert array[0] == byte_array[0]
byte_array[0] += 1
assert array[0] == byte_array[0]
assert array.base == buf
def test_buffer_from_numpy():
# C-contiguous
arr = np.arange(12, dtype=np.int8).reshape((3, 4))
buf = pa.py_buffer(arr)
assert buf.is_cpu
assert buf.is_mutable
assert buf.to_pybytes() == arr.tobytes()
# F-contiguous; note strides information is lost
buf = pa.py_buffer(arr.T)
assert buf.is_cpu
assert buf.is_mutable
assert buf.to_pybytes() == arr.tobytes()
# Non-contiguous
with pytest.raises(ValueError, match="not contiguous"):
buf = pa.py_buffer(arr.T[::2])
def test_buffer_address():
b1 = b'some data!'
b2 = bytearray(b1)
b3 = bytearray(b1)
buf1 = pa.py_buffer(b1)
buf2 = pa.py_buffer(b1)
buf3 = pa.py_buffer(b2)
buf4 = pa.py_buffer(b3)
assert buf1.address > 0
assert buf1.address == buf2.address
assert buf3.address != buf2.address
assert buf4.address != buf3.address
arr = np.arange(5)
buf = pa.py_buffer(arr)
assert buf.address == arr.ctypes.data
def test_buffer_equals():
# Buffer.equals() returns true iff the buffers have the same contents
def eq(a, b):
assert a.equals(b)
assert a == b
assert not (a != b)
def ne(a, b):
assert not a.equals(b)
assert not (a == b)
assert a != b
b1 = b'some data!'
b2 = bytearray(b1)
b3 = bytearray(b1)
b3[0] = 42
buf1 = pa.py_buffer(b1)
buf2 = pa.py_buffer(b2)
buf3 = pa.py_buffer(b2)
buf4 = pa.py_buffer(b3)
buf5 = pa.py_buffer(np.frombuffer(b2, dtype=np.int16))
eq(buf1, buf1)
eq(buf1, buf2)
eq(buf2, buf3)
ne(buf2, buf4)
    # Data type is irrelevant; only the raw bytes are compared
eq(buf2, buf5)
def test_buffer_eq_bytes():
buf = pa.py_buffer(b'some data')
assert buf == b'some data'
assert buf == bytearray(b'some data')
assert buf != b'some dat1'
with pytest.raises(TypeError):
buf == 'some data'
def test_buffer_getitem():
data = bytearray(b'some data!')
buf = pa.py_buffer(data)
n = len(data)
for ix in range(-n, n - 1):
assert buf[ix] == data[ix]
with pytest.raises(IndexError):
buf[n]
with pytest.raises(IndexError):
buf[-n - 1]
def test_buffer_slicing():
data = b'some data!'
buf = pa.py_buffer(data)
sliced = buf.slice(2)
expected = pa.py_buffer(b'me data!')
assert sliced.equals(expected)
sliced2 = buf.slice(2, 4)
expected2 = pa.py_buffer(b'me d')
assert sliced2.equals(expected2)
# 0 offset
assert buf.slice(0).equals(buf)
# Slice past end of buffer
assert len(buf.slice(len(buf))) == 0
with pytest.raises(IndexError):
buf.slice(-1)
with pytest.raises(IndexError):
buf.slice(len(buf) + 1)
assert buf[11:].to_pybytes() == b""
# Slice stop exceeds buffer length
with pytest.raises(IndexError):
buf.slice(1, len(buf))
assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:]
# Negative length
with pytest.raises(IndexError):
buf.slice(1, -1)
# Test slice notation
assert buf[2:].equals(buf.slice(2))
assert buf[2:5].equals(buf.slice(2, 3))
assert buf[-5:].equals(buf.slice(len(buf) - 5))
assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3))
with pytest.raises(IndexError):
buf[::-1]
with pytest.raises(IndexError):
buf[::2]
n = len(buf)
for start in range(-n * 2, n * 2):
for stop in range(-n * 2, n * 2):
assert buf[start:stop].to_pybytes() == buf.to_pybytes()[start:stop]
def test_buffer_hashing():
# Buffers are unhashable
with pytest.raises(TypeError, match="unhashable"):
hash(pa.py_buffer(b'123'))
def test_buffer_protocol_respects_immutability():
# ARROW-3228; NumPy's frombuffer ctor determines whether a buffer-like
# object is mutable by first attempting to get a mutable buffer using
# PyObject_FromBuffer. If that fails, it assumes that the object is
# immutable
a = b'12345'
arrow_ref = pa.py_buffer(a)
numpy_ref = np.frombuffer(arrow_ref, dtype=np.uint8)
assert not numpy_ref.flags.writeable
def test_foreign_buffer():
obj = np.array([1, 2], dtype=np.int32)
addr = obj.__array_interface__["data"][0]
size = obj.nbytes
buf = pa.foreign_buffer(addr, size, obj)
wr = weakref.ref(obj)
del obj
assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2]
assert wr() is not None
del buf
assert wr() is None
def test_allocate_buffer():
buf = pa.allocate_buffer(100)
assert buf.size == 100
assert buf.is_mutable
assert buf.parent is None
bit = b'abcde'
writer = pa.FixedSizeBufferWriter(buf)
writer.write(bit)
assert buf.to_pybytes()[:5] == bit
def test_allocate_buffer_resizable():
buf = pa.allocate_buffer(100, resizable=True)
assert isinstance(buf, pa.ResizableBuffer)
buf.resize(200)
assert buf.size == 200
@pytest.mark.parametrize("compression", [
pytest.param(
"bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
),
"brotli",
"gzip",
"lz4",
"zstd",
"snappy"
])
def test_compress_decompress(compression):
if not Codec.is_available(compression):
pytest.skip("{} support is not built".format(compression))
INPUT_SIZE = 10000
test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
.astype(np.uint8)
.tobytes())
test_buf = pa.py_buffer(test_data)
compressed_buf = pa.compress(test_buf, codec=compression)
compressed_bytes = pa.compress(test_data, codec=compression,
asbytes=True)
assert isinstance(compressed_bytes, bytes)
decompressed_buf = pa.decompress(compressed_buf, INPUT_SIZE,
codec=compression)
decompressed_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
codec=compression, asbytes=True)
assert isinstance(decompressed_bytes, bytes)
assert decompressed_buf.equals(test_buf)
assert decompressed_bytes == test_data
with pytest.raises(ValueError):
pa.decompress(compressed_bytes, codec=compression)
@pytest.mark.parametrize("compression", [
pytest.param(
"bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
),
"brotli",
"gzip",
"lz4",
"zstd",
"snappy"
])
def test_compression_level(compression):
if not Codec.is_available(compression):
pytest.skip("{} support is not built".format(compression))
# These codecs do not support a compression level
no_level = ['snappy']
if compression in no_level:
assert not Codec.supports_compression_level(compression)
with pytest.raises(ValueError):
Codec(compression, 0)
with pytest.raises(ValueError):
Codec.minimum_compression_level(compression)
with pytest.raises(ValueError):
Codec.maximum_compression_level(compression)
with pytest.raises(ValueError):
Codec.default_compression_level(compression)
return
INPUT_SIZE = 10000
test_data = (np.random.randint(0, 255, size=INPUT_SIZE)
.astype(np.uint8)
.tobytes())
test_buf = pa.py_buffer(test_data)
min_level = Codec.minimum_compression_level(compression)
max_level = Codec.maximum_compression_level(compression)
default_level = Codec.default_compression_level(compression)
assert min_level < max_level
assert default_level >= min_level
assert default_level <= max_level
for compression_level in range(min_level, max_level+1):
codec = Codec(compression, compression_level)
compressed_buf = codec.compress(test_buf)
compressed_bytes = codec.compress(test_data, asbytes=True)
assert isinstance(compressed_bytes, bytes)
decompressed_buf = codec.decompress(compressed_buf, INPUT_SIZE)
decompressed_bytes = codec.decompress(compressed_bytes, INPUT_SIZE,
asbytes=True)
assert isinstance(decompressed_bytes, bytes)
assert decompressed_buf.equals(test_buf)
assert decompressed_bytes == test_data
with pytest.raises(ValueError):
codec.decompress(compressed_bytes)
    # The ability to seed the RNG this way is not present on older versions
    # of numpy (currently in our python 3.6 CI build). Some inputs might
    # just happen to compress to the same size at both levels, so seeded
    # random numbers are needed for reliable results.
    #
    # The goal of this part is to ensure that the compression_level is
    # passed down to the C++ layer, not to verify the compression
    # algorithms themselves.
if not hasattr(np.random, 'default_rng'):
pytest.skip('Requires newer version of numpy')
rng = np.random.default_rng(seed=42)
values = rng.integers(0, 100, 1000)
arr = pa.array(values)
hard_to_compress_buffer = arr.buffers()[1]
weak_codec = Codec(compression, min_level)
weakly_compressed_buf = weak_codec.compress(hard_to_compress_buffer)
strong_codec = Codec(compression, max_level)
strongly_compressed_buf = strong_codec.compress(hard_to_compress_buffer)
assert len(weakly_compressed_buf) > len(strongly_compressed_buf)
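# A minimal usage sketch (not a collected test): one-shot compression and
# decompression through the Codec API, assuming gzip support is built into
# the local Arrow installation.
def _codec_usage_sketch():
    codec = Codec('gzip')
    data = b'hello world'
    compressed = codec.compress(data)
    # Codecs such as gzip require the decompressed size up front.
    assert codec.decompress(compressed, len(data)) == data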
def test_buffer_memoryview_is_immutable():
val = b'some data'
buf = pa.py_buffer(val)
assert not buf.is_mutable
assert isinstance(buf, pa.Buffer)
result = memoryview(buf)
assert result.readonly
with pytest.raises(TypeError) as exc:
result[0] = b'h'
assert 'cannot modify read-only' in str(exc.value)
b = bytes(buf)
with pytest.raises(TypeError) as exc:
b[0] = b'h'
assert 'cannot modify read-only' in str(exc.value)
def test_uninitialized_buffer():
# ARROW-2039: calling Buffer() directly creates an uninitialized object
# ARROW-2638: prevent calling extension class constructors directly
with pytest.raises(TypeError):
pa.Buffer()
def test_memory_output_stream():
# 10 bytes
val = b'dataabcdef'
f = pa.BufferOutputStream()
K = 1000
for i in range(K):
f.write(val)
buf = f.getvalue()
assert len(buf) == len(val) * K
assert buf.to_pybytes() == val * K
def test_inmemory_write_after_closed():
f = pa.BufferOutputStream()
f.write(b'ok')
assert not f.closed
f.getvalue()
assert f.closed
with pytest.raises(ValueError):
f.write(b'not ok')
def test_buffer_protocol_ref_counting():
def make_buffer(bytes_obj):
return bytearray(pa.py_buffer(bytes_obj))
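    # Exporting through the buffer protocol and materializing a bytearray
    # must not leak a reference to the original bytes object.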
buf = make_buffer(b'foo')
gc.collect()
assert buf == b'foo'
# ARROW-1053
val = b'foo'
refcount_before = sys.getrefcount(val)
for i in range(10):
make_buffer(val)
gc.collect()
assert refcount_before == sys.getrefcount(val)
def test_nativefile_write_memoryview():
f = pa.BufferOutputStream()
data = b'ok'
arr = np.frombuffer(data, dtype='S1')
f.write(arr)
f.write(bytearray(data))
f.write(pa.py_buffer(data))
with pytest.raises(TypeError):
f.write(data.decode('utf8'))
buf = f.getvalue()
assert buf.to_pybytes() == data * 3
# ----------------------------------------------------------------------
# Mock output stream
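# MockOutputStream only counts the bytes written to it, which is useful
# for sizing an output ahead of time without allocating memory.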
def test_mock_output_stream():
# Make sure that the MockOutputStream and the BufferOutputStream record the
# same size
# 10 bytes
val = b'dataabcdef'
f1 = pa.MockOutputStream()
f2 = pa.BufferOutputStream()
K = 1000
for i in range(K):
f1.write(val)
f2.write(val)
assert f1.size() == len(f2.getvalue())
# Do the same test with a table
record_batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['a'])
f1 = pa.MockOutputStream()
f2 = pa.BufferOutputStream()
stream_writer1 = pa.RecordBatchStreamWriter(f1, record_batch.schema)
stream_writer2 = pa.RecordBatchStreamWriter(f2, record_batch.schema)
stream_writer1.write_batch(record_batch)
stream_writer2.write_batch(record_batch)
stream_writer1.close()
stream_writer2.close()
assert f1.size() == len(f2.getvalue())
# ----------------------------------------------------------------------
# OS files and memory maps
@pytest.fixture
def sample_disk_data(request, tmpdir):
SIZE = 4096
arr = np.random.randint(0, 256, size=SIZE).astype('u1')
data = arr.tobytes()[:SIZE]
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(data)
def teardown():
_try_delete(path)
request.addfinalizer(teardown)
return path, data
def _check_native_file_reader(FACTORY, sample_data,
allow_read_out_of_bounds=True):
path, data = sample_data
f = FACTORY(path, mode='r')
assert f.read(10) == data[:10]
assert f.read(0) == b''
assert f.tell() == 10
assert f.read() == data[10:]
assert f.size() == len(data)
f.seek(0)
assert f.tell() == 0
# Seeking past end of file not supported in memory maps
if allow_read_out_of_bounds:
f.seek(len(data) + 1)
assert f.tell() == len(data) + 1
assert f.read(5) == b''
# Test whence argument of seek, ARROW-1287
assert f.seek(3) == 3
assert f.seek(3, os.SEEK_CUR) == 6
assert f.tell() == 6
ex_length = len(data) - 2
assert f.seek(-2, os.SEEK_END) == ex_length
assert f.tell() == ex_length
def test_memory_map_reader(sample_disk_data):
_check_native_file_reader(pa.memory_map, sample_disk_data,
allow_read_out_of_bounds=False)
def test_memory_map_retain_buffer_reference(sample_disk_data):
path, data = sample_disk_data
cases = []
with pa.memory_map(path, 'rb') as f:
cases.append((f.read_buffer(100), data[:100]))
cases.append((f.read_buffer(100), data[100:200]))
cases.append((f.read_buffer(100), data[200:300]))
# Call gc.collect() for good measure
gc.collect()
for buf, expected in cases:
assert buf.to_pybytes() == expected
def test_os_file_reader(sample_disk_data):
_check_native_file_reader(pa.OSFile, sample_disk_data)
def test_os_file_large_seeks():
check_large_seeks(pa.OSFile)
def _try_delete(path):
try:
os.remove(path)
except os.error:
pass
def test_memory_map_writer(tmpdir):
SIZE = 4096
arr = np.random.randint(0, 256, size=SIZE).astype('u1')
data = arr.tobytes()[:SIZE]
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(data)
f = pa.memory_map(path, mode='r+b')
f.seek(10)
f.write(b'peekaboo')
assert f.tell() == 18
f.seek(10)
assert f.read(8) == b'peekaboo'
f2 = pa.memory_map(path, mode='r+b')
f2.seek(10)
f2.write(b'booapeak')
f2.seek(10)
f.seek(10)
assert f.read(8) == b'booapeak'
# Does not truncate file
f3 = pa.memory_map(path, mode='w')
f3.write(b'foo')
with pa.memory_map(path) as f4:
assert f4.size() == SIZE
with pytest.raises(IOError):
f3.read(5)
f.seek(0)
assert f.read(3) == b'foo'
def test_memory_map_resize(tmpdir):
SIZE = 4096
arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8)
data1 = arr.tobytes()[:(SIZE // 2)]
data2 = arr.tobytes()[(SIZE // 2):]
path = os.path.join(str(tmpdir), guid())
    mmap = pa.create_memory_map(path, SIZE // 2)
mmap.write(data1)
mmap.resize(SIZE)
mmap.write(data2)
mmap.close()
with open(path, 'rb') as f:
assert f.read() == arr.tobytes()
def test_memory_zero_length(tmpdir):
path = os.path.join(str(tmpdir), guid())
f = open(path, 'wb')
f.close()
with pa.memory_map(path, mode='r+b') as memory_map:
assert memory_map.size() == 0
def test_memory_map_large_seeks():
check_large_seeks(pa.memory_map)
def test_memory_map_close_remove(tmpdir):
# ARROW-6740: should be able to delete closed memory-mapped file (Windows)
path = os.path.join(str(tmpdir), guid())
mmap = pa.create_memory_map(path, 4096)
mmap.close()
assert mmap.closed
os.remove(path) # Shouldn't fail
def test_memory_map_deref_remove(tmpdir):
path = os.path.join(str(tmpdir), guid())
pa.create_memory_map(path, 4096)
os.remove(path) # Shouldn't fail
def test_os_file_writer(tmpdir):
SIZE = 4096
arr = np.random.randint(0, 256, size=SIZE).astype('u1')
data = arr.tobytes()[:SIZE]
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(data)
# Truncates file
f2 = pa.OSFile(path, mode='w')
f2.write(b'foo')
with pa.OSFile(path) as f3:
assert f3.size() == 3
with pytest.raises(IOError):
f2.read(5)
def test_native_file_write_reject_unicode():
# ARROW-3227
nf = pa.BufferOutputStream()
with pytest.raises(TypeError):
nf.write('foo')
def test_native_file_modes(tmpdir):
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(b'foooo')
with pa.OSFile(path, mode='r') as f:
assert f.mode == 'rb'
assert f.readable()
assert not f.writable()
assert f.seekable()
with pa.OSFile(path, mode='rb') as f:
assert f.mode == 'rb'
assert f.readable()
assert not f.writable()
assert f.seekable()
with pa.OSFile(path, mode='w') as f:
assert f.mode == 'wb'
assert not f.readable()
assert f.writable()
assert not f.seekable()
with pa.OSFile(path, mode='wb') as f:
assert f.mode == 'wb'
assert not f.readable()
assert f.writable()
assert not f.seekable()
with open(path, 'wb') as f:
f.write(b'foooo')
with pa.memory_map(path, 'r') as f:
assert f.mode == 'rb'
assert f.readable()
assert not f.writable()
assert f.seekable()
with pa.memory_map(path, 'r+') as f:
assert f.mode == 'rb+'
assert f.readable()
assert f.writable()
assert f.seekable()
with pa.memory_map(path, 'r+b') as f:
assert f.mode == 'rb+'
assert f.readable()
assert f.writable()
assert f.seekable()
def test_native_file_permissions(tmpdir):
# ARROW-10124: permissions of created files should follow umask
cur_umask = os.umask(0o002)
os.umask(cur_umask)
path = os.path.join(str(tmpdir), guid())
with pa.OSFile(path, mode='w'):
pass
assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
path = os.path.join(str(tmpdir), guid())
with pa.memory_map(path, 'w'):
pass
assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
def test_native_file_raises_ValueError_after_close(tmpdir):
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(b'foooo')
with pa.OSFile(path, mode='rb') as os_file:
assert not os_file.closed
assert os_file.closed
with pa.memory_map(path, mode='rb') as mmap_file:
assert not mmap_file.closed
assert mmap_file.closed
files = [os_file,
mmap_file]
methods = [('tell', ()),
('seek', (0,)),
('size', ()),
('flush', ()),
('readable', ()),
('writable', ()),
('seekable', ())]
for f in files:
for method, args in methods:
with pytest.raises(ValueError):
getattr(f, method)(*args)
def test_native_file_TextIOWrapper(tmpdir):
data = ('foooo\n'
'barrr\n'
'bazzz\n')
path = os.path.join(str(tmpdir), guid())
with open(path, 'wb') as f:
f.write(data.encode('utf-8'))
with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
assert fil.readable()
res = fil.read()
assert res == data
assert fil.closed
with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
# Iteration works
lines = list(fil)
assert ''.join(lines) == data
# Writing
path2 = os.path.join(str(tmpdir), guid())
with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
assert fil.writable()
fil.write(data)
with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
res = fil.read()
assert res == data
def test_native_file_open_error():
with assert_file_not_found():
pa.OSFile('non_existent_file', 'rb')
with assert_file_not_found():
pa.memory_map('non_existent_file', 'rb')
# ----------------------------------------------------------------------
# Buffered streams
def test_buffered_input_stream():
raw = pa.BufferReader(b"123456789")
f = pa.BufferedInputStream(raw, buffer_size=4)
assert f.read(2) == b"12"
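    # Although only 2 bytes were consumed, the buffered layer read a full
    # 4-byte block ahead from the raw stream.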
assert raw.tell() == 4
f.close()
assert f.closed
assert raw.closed
def test_buffered_input_stream_detach_seekable():
# detach() to a seekable file (io::RandomAccessFile in C++)
f = pa.BufferedInputStream(pa.BufferReader(b"123456789"), buffer_size=4)
assert f.read(2) == b"12"
raw = f.detach()
assert f.closed
assert not raw.closed
assert raw.seekable()
assert raw.read(4) == b"5678"
raw.seek(2)
assert raw.read(4) == b"3456"
def test_buffered_input_stream_detach_non_seekable():
# detach() to a non-seekable file (io::InputStream in C++)
f = pa.BufferedInputStream(
pa.BufferedInputStream(pa.BufferReader(b"123456789"), buffer_size=4),
buffer_size=4)
assert f.read(2) == b"12"
raw = f.detach()
assert f.closed
assert not raw.closed
assert not raw.seekable()
assert raw.read(4) == b"5678"
with pytest.raises(EnvironmentError):
raw.seek(2)
def test_buffered_output_stream():
np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer
buf = pa.py_buffer(np_buf)
raw = pa.FixedSizeBufferWriter(buf)
f = pa.BufferedOutputStream(raw, buffer_size=4)
f.write(b"12")
assert np_buf[:4].tobytes() == b'\0\0\0\0'
f.flush()
assert np_buf[:4].tobytes() == b'12\0\0'
f.write(b"3456789")
f.close()
assert f.closed
assert raw.closed
assert np_buf[:10].tobytes() == b'123456789\0'
def test_buffered_output_stream_detach():
np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer
buf = pa.py_buffer(np_buf)
f = pa.BufferedOutputStream(pa.FixedSizeBufferWriter(buf), buffer_size=4)
f.write(b"12")
assert np_buf[:4].tobytes() == b'\0\0\0\0'
raw = f.detach()
assert f.closed
assert not raw.closed
assert np_buf[:4].tobytes() == b'12\0\0'
# ----------------------------------------------------------------------
# Compressed input and output streams
def check_compressed_input(data, fn, compression):
raw = pa.OSFile(fn, mode="rb")
with pa.CompressedInputStream(raw, compression) as compressed:
assert not compressed.closed
assert compressed.readable()
assert not compressed.writable()
assert not compressed.seekable()
got = compressed.read()
assert got == data
assert compressed.closed
assert raw.closed
# Same with read_buffer()
raw = pa.OSFile(fn, mode="rb")
with pa.CompressedInputStream(raw, compression) as compressed:
buf = compressed.read_buffer()
assert isinstance(buf, pa.Buffer)
assert buf.to_pybytes() == data
@pytest.mark.gzip
def test_compressed_input_gzip(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "compressed_input_test.gz")
with gzip.open(fn, "wb") as f:
f.write(data)
check_compressed_input(data, fn, "gzip")
def test_compressed_input_bz2(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "compressed_input_test.bz2")
with bz2.BZ2File(fn, "w") as f:
f.write(data)
try:
check_compressed_input(data, fn, "bz2")
except NotImplementedError as e:
pytest.skip(str(e))
@pytest.mark.gzip
def test_compressed_input_openfile(tmpdir):
if not Codec.is_available("gzip"):
pytest.skip("gzip support is not built")
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "test_compressed_input_openfile.gz")
with gzip.open(fn, "wb") as f:
f.write(data)
with pa.CompressedInputStream(fn, "gzip") as compressed:
buf = compressed.read_buffer()
assert buf.to_pybytes() == data
assert compressed.closed
with pa.CompressedInputStream(pathlib.Path(fn), "gzip") as compressed:
buf = compressed.read_buffer()
assert buf.to_pybytes() == data
assert compressed.closed
f = open(fn, "rb")
with pa.CompressedInputStream(f, "gzip") as compressed:
buf = compressed.read_buffer()
assert buf.to_pybytes() == data
assert f.closed
def check_compressed_concatenated(data, fn, compression):
raw = pa.OSFile(fn, mode="rb")
with pa.CompressedInputStream(raw, compression) as compressed:
got = compressed.read()
assert got == data
@pytest.mark.gzip
def test_compressed_concatenated_gzip(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "compressed_input_test2.gz")
with gzip.open(fn, "wb") as f:
f.write(data[:50])
with gzip.open(fn, "ab") as f:
f.write(data[50:])
check_compressed_concatenated(data, fn, "gzip")
@pytest.mark.gzip
def test_compressed_input_invalid():
data = b"foo" * 10
raw = pa.BufferReader(data)
with pytest.raises(ValueError):
pa.CompressedInputStream(raw, "unknown_compression")
with pytest.raises(TypeError):
pa.CompressedInputStream(raw, None)
with pa.CompressedInputStream(raw, "gzip") as compressed:
with pytest.raises(IOError, match="zlib inflate failed"):
compressed.read()
def make_compressed_output(data, fn, compression):
raw = pa.BufferOutputStream()
with pa.CompressedOutputStream(raw, compression) as compressed:
assert not compressed.closed
assert not compressed.readable()
assert compressed.writable()
assert not compressed.seekable()
compressed.write(data)
assert compressed.closed
assert raw.closed
with open(fn, "wb") as f:
f.write(raw.getvalue())
@pytest.mark.gzip
def test_compressed_output_gzip(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "compressed_output_test.gz")
make_compressed_output(data, fn, "gzip")
with gzip.open(fn, "rb") as f:
got = f.read()
assert got == data
def test_compressed_output_bz2(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
fn = str(tmpdir / "compressed_output_test.bz2")
try:
make_compressed_output(data, fn, "bz2")
except NotImplementedError as e:
pytest.skip(str(e))
with bz2.BZ2File(fn, "r") as f:
got = f.read()
assert got == data
def test_output_stream_constructor(tmpdir):
if not Codec.is_available("gzip"):
pytest.skip("gzip support is not built")
with pa.CompressedOutputStream(tmpdir / "ctor.gz", "gzip") as stream:
stream.write(b"test")
with (tmpdir / "ctor2.gz").open("wb") as f:
with pa.CompressedOutputStream(f, "gzip") as stream:
stream.write(b"test")
@pytest.mark.parametrize(("path", "expected_compression"), [
("file.bz2", "bz2"),
("file.lz4", "lz4"),
(pathlib.Path("file.gz"), "gzip"),
(pathlib.Path("path/to/file.zst"), "zstd"),
])
def test_compression_detection(path, expected_compression):
if not Codec.is_available(expected_compression):
with pytest.raises(pa.lib.ArrowNotImplementedError):
Codec.detect(path)
else:
codec = Codec.detect(path)
assert isinstance(codec, Codec)
assert codec.name == expected_compression
def test_unknown_compression_raises():
with pytest.raises(ValueError):
Codec.is_available('unknown')
with pytest.raises(TypeError):
Codec(None)
with pytest.raises(ValueError):
Codec('unknown')
@pytest.mark.parametrize("compression", [
"bz2",
"brotli",
"gzip",
"lz4",
"zstd",
pytest.param(
"snappy",
marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
)
])
def test_compressed_roundtrip(compression):
if not Codec.is_available(compression):
pytest.skip("{} support is not built".format(compression))
data = b"some test data\n" * 10 + b"eof\n"
raw = pa.BufferOutputStream()
with pa.CompressedOutputStream(raw, compression) as compressed:
compressed.write(data)
cdata = raw.getvalue()
assert len(cdata) < len(data)
raw = pa.BufferReader(cdata)
with pa.CompressedInputStream(raw, compression) as compressed:
got = compressed.read()
assert got == data
@pytest.mark.parametrize(
"compression",
["bz2", "brotli", "gzip", "lz4", "zstd"]
)
def test_compressed_recordbatch_stream(compression):
if not Codec.is_available(compression):
pytest.skip("{} support is not built".format(compression))
# ARROW-4836: roundtrip a RecordBatch through a compressed stream
table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
raw = pa.BufferOutputStream()
stream = pa.CompressedOutputStream(raw, compression)
writer = pa.RecordBatchStreamWriter(stream, table.schema)
writer.write_table(table, max_chunksize=3)
writer.close()
stream.close() # Flush data
buf = raw.getvalue()
stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
got_table = pa.RecordBatchStreamReader(stream).read_all()
assert got_table == table
# ----------------------------------------------------------------------
# Transform input streams
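# transcoding_input_stream decodes bytes from the source encoding and
# re-encodes them to the destination encoding on the fly while reading.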
unicode_transcoding_example = (
"Dès Noël où un zéphyr haï me vêt de glaçons würmiens "
"je dîne dexquis rôtis de bœuf au kir à laÿ dâge mûr & cætera !"
)
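# (a French pangram, chosen to exercise a variety of non-ASCII codepoints)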
def check_transcoding(data, src_encoding, dest_encoding, chunk_sizes):
chunk_sizes = iter(chunk_sizes)
stream = pa.transcoding_input_stream(
pa.BufferReader(data.encode(src_encoding)),
src_encoding, dest_encoding)
out = []
while True:
buf = stream.read(next(chunk_sizes))
out.append(buf)
if not buf:
break
out = b''.join(out)
assert out.decode(dest_encoding) == data
@pytest.mark.parametrize('src_encoding, dest_encoding',
[('utf-8', 'utf-16'),
('utf-16', 'utf-8'),
('utf-8', 'utf-32-le'),
('utf-8', 'utf-32-be'),
])
def test_transcoding_input_stream(src_encoding, dest_encoding):
# All at once
check_transcoding(unicode_transcoding_example,
src_encoding, dest_encoding, [1000, 0])
# Incremental
check_transcoding(unicode_transcoding_example,
src_encoding, dest_encoding,
itertools.cycle([1, 2, 3, 5]))
@pytest.mark.parametrize('src_encoding, dest_encoding',
[('utf-8', 'utf-8'),
('utf-8', 'UTF8')])
def test_transcoding_no_ops(src_encoding, dest_encoding):
    # No extra indirection is added when a trivial transcoding is requested
stream = pa.BufferReader(b"abc123")
assert pa.transcoding_input_stream(
stream, src_encoding, dest_encoding) is stream
@pytest.mark.parametrize('src_encoding, dest_encoding',
[('utf-8', 'ascii'),
('utf-8', 'latin-1'),
])
def test_transcoding_encoding_error(src_encoding, dest_encoding):
# Character \u0100 cannot be represented in the destination encoding
stream = pa.transcoding_input_stream(
pa.BufferReader("\u0100".encode(src_encoding)),
src_encoding,
dest_encoding)
with pytest.raises(UnicodeEncodeError):
stream.read(1)
@pytest.mark.parametrize('src_encoding, dest_encoding',
[('utf-8', 'utf-16'),
('utf-16', 'utf-8'),
])
def test_transcoding_decoding_error(src_encoding, dest_encoding):
# The given bytestring is not valid in the source encoding
stream = pa.transcoding_input_stream(
pa.BufferReader(b"\xff\xff\xff\xff"),
src_encoding,
dest_encoding)
with pytest.raises(UnicodeError):
stream.read(1)
# ----------------------------------------------------------------------
# High-level API
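# pa.input_stream() and pa.output_stream() dispatch on the argument type
# (buffers, file paths, Python or Arrow file objects) and can detect the
# compression from the file extension.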
@pytest.mark.gzip
def test_input_stream_buffer():
data = b"some test data\n" * 10 + b"eof\n"
for arg in [pa.py_buffer(data), memoryview(data)]:
stream = pa.input_stream(arg)
assert stream.read() == data
gz_data = gzip.compress(data)
stream = pa.input_stream(memoryview(gz_data))
assert stream.read() == gz_data
stream = pa.input_stream(memoryview(gz_data), compression='gzip')
assert stream.read() == data
def test_input_stream_duck_typing():
# Accept objects having the right file-like methods...
class DuckReader:
def close(self):
pass
@property
def closed(self):
return False
def read(self, nbytes=None):
return b'hello'
stream = pa.input_stream(DuckReader())
assert stream.read(5) == b'hello'
def test_input_stream_file_path(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
file_path = tmpdir / 'input_stream'
with open(str(file_path), 'wb') as f:
f.write(data)
stream = pa.input_stream(file_path)
assert stream.read() == data
stream = pa.input_stream(str(file_path))
assert stream.read() == data
stream = pa.input_stream(pathlib.Path(str(file_path)))
assert stream.read() == data
@pytest.mark.gzip
def test_input_stream_file_path_compressed(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
gz_data = gzip.compress(data)
file_path = tmpdir / 'input_stream.gz'
with open(str(file_path), 'wb') as f:
f.write(gz_data)
stream = pa.input_stream(file_path)
assert stream.read() == data
stream = pa.input_stream(str(file_path))
assert stream.read() == data
stream = pa.input_stream(pathlib.Path(str(file_path)))
assert stream.read() == data
stream = pa.input_stream(file_path, compression='gzip')
assert stream.read() == data
stream = pa.input_stream(file_path, compression=None)
assert stream.read() == gz_data
def test_input_stream_file_path_buffered(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
file_path = tmpdir / 'input_stream.buffered'
with open(str(file_path), 'wb') as f:
f.write(data)
stream = pa.input_stream(file_path, buffer_size=32)
assert isinstance(stream, pa.BufferedInputStream)
assert stream.read() == data
stream = pa.input_stream(str(file_path), buffer_size=64)
assert isinstance(stream, pa.BufferedInputStream)
assert stream.read() == data
stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
assert isinstance(stream, pa.BufferedInputStream)
assert stream.read() == data
unbuffered_stream = pa.input_stream(file_path, buffer_size=0)
assert isinstance(unbuffered_stream, pa.OSFile)
msg = 'Buffer size must be larger than zero'
with pytest.raises(ValueError, match=msg):
pa.input_stream(file_path, buffer_size=-1)
with pytest.raises(TypeError):
pa.input_stream(file_path, buffer_size='million')
@pytest.mark.gzip
def test_input_stream_file_path_compressed_and_buffered(tmpdir):
data = b"some test data\n" * 100 + b"eof\n"
gz_data = gzip.compress(data)
file_path = tmpdir / 'input_stream_compressed_and_buffered.gz'
with open(str(file_path), 'wb') as f:
f.write(gz_data)
stream = pa.input_stream(file_path, buffer_size=32, compression='gzip')
assert stream.read() == data
stream = pa.input_stream(str(file_path), buffer_size=64)
assert stream.read() == data
stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
assert stream.read() == data
@pytest.mark.gzip
def test_input_stream_python_file(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
bio = BytesIO(data)
stream = pa.input_stream(bio)
assert stream.read() == data
gz_data = gzip.compress(data)
bio = BytesIO(gz_data)
stream = pa.input_stream(bio)
assert stream.read() == gz_data
bio.seek(0)
stream = pa.input_stream(bio, compression='gzip')
assert stream.read() == data
file_path = tmpdir / 'input_stream'
with open(str(file_path), 'wb') as f:
f.write(data)
with open(str(file_path), 'rb') as f:
stream = pa.input_stream(f)
assert stream.read() == data
@pytest.mark.gzip
def test_input_stream_native_file():
data = b"some test data\n" * 10 + b"eof\n"
gz_data = gzip.compress(data)
reader = pa.BufferReader(gz_data)
stream = pa.input_stream(reader)
assert stream is reader
reader = pa.BufferReader(gz_data)
stream = pa.input_stream(reader, compression='gzip')
assert stream.read() == data
def test_input_stream_errors(tmpdir):
buf = memoryview(b"")
with pytest.raises(ValueError):
pa.input_stream(buf, compression="foo")
for arg in [bytearray(), StringIO()]:
with pytest.raises(TypeError):
pa.input_stream(arg)
with assert_file_not_found():
pa.input_stream("non_existent_file")
with open(str(tmpdir / 'new_file'), 'wb') as f:
with pytest.raises(TypeError, match="readable file expected"):
pa.input_stream(f)
def test_output_stream_buffer():
data = b"some test data\n" * 10 + b"eof\n"
buf = bytearray(len(data))
stream = pa.output_stream(pa.py_buffer(buf))
stream.write(data)
assert buf == data
buf = bytearray(len(data))
stream = pa.output_stream(memoryview(buf))
stream.write(data)
assert buf == data
def test_output_stream_duck_typing():
# Accept objects having the right file-like methods...
class DuckWriter:
def __init__(self):
self.buf = pa.BufferOutputStream()
def close(self):
pass
@property
def closed(self):
return False
def write(self, data):
self.buf.write(data)
duck_writer = DuckWriter()
stream = pa.output_stream(duck_writer)
assert stream.write(b'hello')
assert duck_writer.buf.getvalue().to_pybytes() == b'hello'
def test_output_stream_file_path(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
file_path = tmpdir / 'output_stream'
def check_data(file_path, data):
with pa.output_stream(file_path) as stream:
stream.write(data)
with open(str(file_path), 'rb') as f:
assert f.read() == data
check_data(file_path, data)
check_data(str(file_path), data)
check_data(pathlib.Path(str(file_path)), data)
@pytest.mark.gzip
def test_output_stream_file_path_compressed(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
file_path = tmpdir / 'output_stream.gz'
def check_data(file_path, data, **kwargs):
with pa.output_stream(file_path, **kwargs) as stream:
stream.write(data)
with open(str(file_path), 'rb') as f:
return f.read()
assert gzip.decompress(check_data(file_path, data)) == data
assert gzip.decompress(check_data(str(file_path), data)) == data
assert gzip.decompress(
check_data(pathlib.Path(str(file_path)), data)) == data
assert gzip.decompress(
check_data(file_path, data, compression='gzip')) == data
assert check_data(file_path, data, compression=None) == data
with pytest.raises(ValueError, match='Invalid value for compression'):
assert check_data(file_path, data, compression='rabbit') == data
def test_output_stream_file_path_buffered(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
file_path = tmpdir / 'output_stream.buffered'
def check_data(file_path, data, **kwargs):
with pa.output_stream(file_path, **kwargs) as stream:
if kwargs.get('buffer_size', 0) > 0:
assert isinstance(stream, pa.BufferedOutputStream)
stream.write(data)
with open(str(file_path), 'rb') as f:
return f.read()
unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
assert isinstance(unbuffered_stream, pa.OSFile)
msg = 'Buffer size must be larger than zero'
with pytest.raises(ValueError, match=msg):
assert check_data(file_path, data, buffer_size=-128) == data
assert check_data(file_path, data, buffer_size=32) == data
assert check_data(file_path, data, buffer_size=1024) == data
assert check_data(str(file_path), data, buffer_size=32) == data
result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
assert result == data
@pytest.mark.gzip
def test_output_stream_file_path_compressed_and_buffered(tmpdir):
data = b"some test data\n" * 100 + b"eof\n"
file_path = tmpdir / 'output_stream_compressed_and_buffered.gz'
def check_data(file_path, data, **kwargs):
with pa.output_stream(file_path, **kwargs) as stream:
stream.write(data)
with open(str(file_path), 'rb') as f:
return f.read()
result = check_data(file_path, data, buffer_size=32)
assert gzip.decompress(result) == data
result = check_data(file_path, data, buffer_size=1024)
assert gzip.decompress(result) == data
result = check_data(file_path, data, buffer_size=1024, compression='gzip')
assert gzip.decompress(result) == data
def test_output_stream_destructor(tmpdir):
# The wrapper returned by pa.output_stream() should respect Python
# file semantics, i.e. destroying it should close the underlying
# file cleanly.
data = b"some test data\n"
file_path = tmpdir / 'output_stream.buffered'
def check_data(file_path, data, **kwargs):
stream = pa.output_stream(file_path, **kwargs)
stream.write(data)
del stream
gc.collect()
with open(str(file_path), 'rb') as f:
return f.read()
assert check_data(file_path, data, buffer_size=0) == data
assert check_data(file_path, data, buffer_size=1024) == data
@pytest.mark.gzip
def test_output_stream_python_file(tmpdir):
data = b"some test data\n" * 10 + b"eof\n"
def check_data(data, **kwargs):
# XXX cannot use BytesIO because stream.close() is necessary
# to finish writing compressed data, but it will also close the
# underlying BytesIO
fn = str(tmpdir / 'output_stream_file')
with open(fn, 'wb') as f:
with pa.output_stream(f, **kwargs) as stream:
stream.write(data)
with open(fn, 'rb') as f:
return f.read()
assert check_data(data) == data
assert gzip.decompress(check_data(data, compression='gzip')) == data
def test_output_stream_errors(tmpdir):
buf = memoryview(bytearray())
with pytest.raises(ValueError):
pa.output_stream(buf, compression="foo")
for arg in [bytearray(), StringIO()]:
with pytest.raises(TypeError):
pa.output_stream(arg)
fn = str(tmpdir / 'new_file')
with open(fn, 'wb') as f:
pass
with open(fn, 'rb') as f:
with pytest.raises(TypeError, match="writable file expected"):
pa.output_stream(f)