first commit

This commit is contained in:
Ayxan
2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
"""Intialize the smmap package"""
__author__ = "Sebastian Thiel"
__contact__ = "byronimo@gmail.com"
__homepage__ = "https://github.com/gitpython-developers/smmap"
version_info = (5, 0, 0)
__version__ = '.'.join(str(i) for i in version_info)
# make everything available in root package for convenience
from .mman import *
from .buf import *

View File

@@ -0,0 +1,143 @@
"""Module with a simple buffer implementation using the memory manager"""
import sys
__all__ = ["SlidingWindowMapBuffer"]
class SlidingWindowMapBuffer:
    """A buffer like object which allows byte-wise access and slicing into the
    memory of a mapped file. The mapping is controlled by the provided cursor.

    Indices and slice bounds are handed to the cursor unchanged, i.e. they are
    used as the offsets the cursor's ``use_region()``/``includes_ofs()`` expect.

    NOTE(review): the original documentation described indices as relative to
    the offset passed to ``__init__``/``begin_access``; the code does not add
    that offset anywhere - confirm the intended semantics.

    **Note:** Although this type effectively hides the fact that there are mapped windows
    underneath, it can unfortunately not be used in any non-pure python method which
    needs a buffer or string"""
    __slots__ = (
        '_c',           # our cursor
        '_size',        # our supposed size
    )

    def __init__(self, cursor=None, offset=0, size=sys.maxsize, flags=0):
        """Initialize the instance to operate on the given cursor.

        :param cursor: if not None, the associated cursor to the file you want to access
            If None, you have to call begin_access before using the buffer and provide a cursor
        :param offset: absolute offset in bytes
        :param size: the total size of the mapping. Defaults to the maximum possible size
            From that point on, the __len__ of the buffer will be the given size or the file size.
            If the size is larger than the mappable area, you can only access the actually available
            area, although the length of the buffer is reported to be your given size.
            Hence it is in your own interest to provide a proper size !
        :param flags: Additional flags, forwarded to the cursor's use_region()
        :raise ValueError: if the buffer could not achieve a valid state"""
        self._c = cursor
        # Start out empty: without this, len() on a buffer created without a cursor
        # would fail with AttributeError as the slot was never assigned
        self._size = 0
        if cursor and not self.begin_access(cursor, offset, size, flags):
            raise ValueError("Failed to allocate the buffer - probably the given offset is out of bounds")
        # END handle offset

    def __del__(self):
        self.end_access()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end_access()

    def __len__(self):
        """:return: the size set by the last begin_access(), 0 if access was not begun or ended"""
        return self._size

    def __getitem__(self, i):
        """:return: a single byte value for an integer index, or the selected bytes for a slice.

        Slice steps are ignored. Negative integer indices count from the end of the buffer.
        If the cursor does not currently cover the index, it is moved there first."""
        if isinstance(i, slice):
            # Only a missing bound gets a default - an explicit stop of 0 must stay 0,
            # otherwise buf[0:0] would return the whole buffer
            start = 0 if i.start is None else i.start
            stop = self._size if i.stop is None else i.stop
            return self.__getslice__(start, stop)
        # END handle slices
        c = self._c
        assert c.is_valid()
        if i < 0:
            i = self._size + i
        if not c.includes_ofs(i):
            c.use_region(i, 1)
        # END handle region usage
        return c.buffer()[i - c.ofs_begin()]

    def __getslice__(self, i, j):
        """:return: the bytes in range [i, j).

        If the range lies within the cursor's current region, a slice of the cursor's
        buffer is returned directly. Otherwise the cursor is moved region by region
        and the pieces are joined into a bytes object.
        Negative bounds count from the end of the buffer; an empty bytes object is
        returned if j is smaller than i."""
        c = self._c
        assert c.is_valid()
        if i < 0:
            i = self._size + i
        if j == sys.maxsize:
            j = self._size
        if j < 0:
            j = self._size + j
        if j < i:
            # Reversed bounds select nothing. Without this guard the fast path could
            # compute negative buffer indices and the slow path a negative length
            return bytes()
        # END handle reversed bounds

        # fast path, slice fully included - saves a concatenate operation and
        # should be the default
        if (c.ofs_begin() <= i) and (j < c.ofs_end()):
            b = c.ofs_begin()
            return c.buffer()[i - b:j - b]
        # END fast path

        remaining = j - i                 # total length still to read
        ofs = i
        # It's fastest to keep tokens and join later, especially in py3, which was 7 times slower
        # in the previous iteration of this code
        chunks = list()
        while remaining:
            c.use_region(ofs, remaining)
            assert c.is_valid()
            d = c.buffer()[:remaining]
            ofs += len(d)
            remaining -= len(d)
            # Make sure we don't keep references, as c.use_region() might attempt to free resources, but
            # can't unless we use pure bytes
            if hasattr(d, 'tobytes'):
                d = d.tobytes()
            chunks.append(d)
        # END while there are bytes to read
        return bytes().join(chunks)

    #{ Interface

    def begin_access(self, cursor=None, offset=0, size=sys.maxsize, flags=0):
        """Call this before the first use of this instance. The method was already
        called by the constructor in case sufficient information was provided.

        For more information on the parameters, see the __init__ method

        :param cursor: if not None, it replaces the current cursor. Otherwise the
            existing one will be used.
        :return: True if the buffer can be used"""
        if cursor:
            self._c = cursor
        # END update our cursor

        # reuse existing cursors if possible
        if self._c is not None and self._c.is_associated():
            res = self._c.use_region(offset, size, flags).is_valid()
            if res:
                # if given size is too large or default, we compute a proper size
                # If its smaller, we assume the combination between offset and size
                # as chosen by the user is correct and use it !
                # If not, the user is in trouble.
                if size > self._c.file_size():
                    size = self._c.file_size() - offset
                # END handle size
                self._size = size
            # END set size
            return res
        # END use our cursor
        return False

    def end_access(self):
        """Call this method once you are done using the instance. It is automatically
        called on destruction, and should be called just in time to allow system
        resources to be freed.

        Once you called end_access, you must call begin access before reusing this instance!"""
        self._size = 0
        if self._c is not None:
            self._c.unuse_region()
        # END unuse region

    def cursor(self):
        """:return: the currently set cursor which provides access to the data"""
        return self._c

    #}END interface

View File

@@ -0,0 +1,588 @@
"""Module containing a memory memory manager which provides a sliding window on a number of memory mapped files"""
from .util import (
MapWindow,
MapRegion,
MapRegionList,
is_64_bit,
)
import sys
from functools import reduce
__all__ = ["StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"]
#{ Utilities
#}END utilities
class WindowCursor:
    """
    Handle onto a mapped region of a file, obtained from a memory manager.

    While a cursor uses a region it holds one client count on it; the count is
    given back by unuse_region(), which also runs on destruction and when leaving
    a ``with`` block.

    Cursors are normally handed out by a manager's make_cursor() rather than being
    constructed by hand. The same cursor type serves both manager flavours."""
    __slots__ = (
        '_manager',  # the manager keeping all file regions
        '_rlist',    # a regions list with regions for our file
        '_region',   # our current class:`MapRegion` or None
        '_ofs',      # relative offset from the actually mapped area to our start area
        '_size'      # maximum size we should provide
    )

    def __init__(self, manager=None, regions=None):
        """Create a cursor for the given manager and regions list.
        It does not point to any region until use_region() is called."""
        self._manager, self._rlist = manager, regions
        self._region = None
        self._ofs = self._size = 0

    def __del__(self):
        self._destroy()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._destroy()

    def _destroy(self):
        """Release the current region and, if our regions list is empty, drop that
        list from the manager's file dictionary. Safe to call repeatedly."""
        self.unuse_region()

        if self._rlist is None:
            return
        # END nothing associated

        try:
            # An empty regions list means nothing of the file is mapped anymore,
            # so the manager does not need to keep an entry for it
            if len(self._rlist) == 0:
                self._manager._fdict.pop(self._rlist.path_or_fd())
            # END remove regions list from manager
        except (TypeError, KeyError):
            # Best effort only: the entry may be gone already (KeyError), and the
            # original authors reported TypeErrors during interpreter shutdown
            pass
        # END exception handling

    def _copy_from(self, rhs):
        """Take over the state of rhs and add a client count to every region
        of the new regions list as well as to the current region, if any.

        NOTE(review): the new list is built as ``type(rhs._rlist)(rhs._rlist)``; what
        that constructor does with its argument is defined outside this view - confirm."""
        self._manager = rhs._manager
        self._rlist = type(rhs._rlist)(rhs._rlist)
        self._region = rhs._region
        self._ofs = rhs._ofs
        self._size = rhs._size

        for shared in self._rlist:
            shared.increment_client_count()
        # END for each region

        if self._region is not None:
            self._region.increment_client_count()
        # END handle current region

    def __copy__(self):
        """Support for copy.copy(): a fresh cursor of our type with our state"""
        duplicate = type(self)()
        duplicate._copy_from(self)
        return duplicate

    #{ Interface

    def assign(self, rhs):
        """Make this instance a real copy of rhs, releasing whatever it held before.
        The copy module can be used instead to obtain a new instance."""
        self._destroy()
        self._copy_from(rhs)

    def use_region(self, offset=0, size=0, flags=0):
        """Make the cursor point to a region which contains the given offset.

        :param offset: absolute offset in bytes into the file
        :param size: amount of bytes to map. If 0, all available bytes will be mapped.
            The value is clamped to the manager's window size, if that is non-zero
        :param flags: additional flags to be given to os.open in case a file handle is
            initially opened for mapping. Has no effect if a region can actually be reused.
        :return: this instance - query is_valid() to learn whether it points to a region.
            It does not if the offset is at or beyond the end of the file

        **Note:**: the size actually available may be smaller than requested, as it is
        limited by the end of the region that was obtained"""
        manager = self._manager
        file_size = self._rlist.file_size()
        # clamp size to window size, treating 0 as 'everything'
        size = min(size or file_size, manager.window_size() or file_size)

        can_reuse = False
        if self._region is not None:
            if self._region.includes_ofs(offset):
                can_reuse = True
            else:
                self.unuse_region()
            # END handle existing region
        # END check existing region

        if offset >= file_size:
            # offset too large - nothing to map
            return self
        # END handle offset

        if not can_reuse:
            self._region = manager._obtain_region(self._rlist, offset, size, flags, False)
            self._region.increment_client_count()
        # END obtain new region

        self._ofs = offset - self._region._b
        self._size = min(size, self._region.ofs_end() - offset)
        return self

    def unuse_region(self):
        """Give back our client count on the current region and forget the region.
        Does nothing if there is no current region.

        **Note:** this happens automatically on destruction; calling it early on
        long-lived cursors lets resources be freed sooner"""
        if self._region is not None:
            self._region.increment_client_count(-1)
        self._region = None
        # note: offset and size are deliberately left untouched. They must not be
        # queried while the cursor is invalid

    def buffer(self):
        """:return: a memoryview onto the current region's buffer, starting at our
        offset and spanning our size. It may be smaller than requested in use_region()

        **Note:** only call this if is_valid() is True
        **Note:** don't keep the returned view longer than needed, as it may prevent
        resources from being freed"""
        start = self._ofs
        return memoryview(self._region.buffer())[start:start + self._size]

    def map(self):
        """
        :return: whatever the current region's map() returns - the underlying raw
            memory map. Its offset and size are likely to differ from what was
            requested from this cursor"""
        return self._region.map()

    def is_valid(self):
        """:return: True if we currently point to a region"""
        return self._region is not None

    def is_associated(self):
        """:return: True if we have a regions list, i.e. belong to a specific file"""
        return self._rlist is not None

    def ofs_begin(self):
        """:return: offset to the first byte pointed to by our cursor

        **Note:** only if is_valid() is True"""
        return self._region._b + self._ofs

    def ofs_end(self):
        """:return: offset to one past the last available byte"""
        return self._region._b + self._ofs + self._size

    def size(self):
        """:return: amount of bytes we point to"""
        return self._size

    def region(self):
        """:return: our current region, or None if nothing is mapped"""
        return self._region

    def includes_ofs(self, ofs):
        """:return: True if the given absolute offset lies within the range this
        cursor currently points to

        **Note:** cursor must be valid for this to work"""
        first = self._region._b + self._ofs
        return first <= ofs < first + self._size

    def file_size(self):
        """:return: size of the underlying file, as reported by our regions list"""
        return self._rlist.file_size()

    def path_or_fd(self):
        """:return: path or file descriptor of the underlying mapped file"""
        return self._rlist.path_or_fd()

    def path(self):
        """:return: path of the underlying mapped file
        :raise ValueError: if the file was given as an int, i.e. a file descriptor"""
        target = self._rlist.path_or_fd()
        if isinstance(target, int):
            raise ValueError("Path queried although mapping was applied to a file descriptor")
        # END handle type
        return target

    def fd(self):
        """:return: file descriptor used to create the underlying mapping.

        **Note:** it is not required to be valid anymore
        :raise ValueError: if the file was given as a str, i.e. a path"""
        target = self._rlist.path_or_fd()
        if isinstance(target, str):
            raise ValueError("File descriptor queried although mapping was generated from path")
        # END handle type
        return target

    #} END interface
class StaticWindowMapManager:
    """Provides a manager which will produce single size cursors that are allowed
    to always map the whole file.

    Clients must be written to specifically know that they are accessing their data
    through a StaticWindowMapManager, as they otherwise have to deal with their window size.

    These clients would have to use a SlidingWindowMapBuffer to hide this fact.

    This type will always use a maximum window size, and optimize certain methods to
    accommodate this fact"""

    __slots__ = [
        '_fdict',           # mapping of path or file descriptor -> regions list
        '_window_size',     # maximum size of a window
        '_max_memory_size',  # maximum amount of memory we may allocate
        '_max_handle_count',  # maximum amount of handles to keep open
        '_memory_size',     # currently allocated memory size
        '_handle_count',    # amount of currently allocated file handles
    ]

    #{ Configuration
    # Types used to build regions lists, windows, regions and cursors.
    # Kept as class attributes so they can be replaced in subclasses
    MapRegionListCls = MapRegionList
    MapWindowCls = MapWindow
    MapRegionCls = MapRegion
    WindowCursorCls = WindowCursor
    #} END configuration

    _MB_in_bytes = 1024 * 1024

    def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsize):
        """initialize the manager with the given parameters.

        :param window_size: if negative, a default window size is chosen: 1024 MB if
            is_64_bit() reports True, 64 MB otherwise.
            If 0, the window may have any size, which basically results in mapping the whole file at once
        :param max_memory_size: maximum amount of memory we may map at once before releasing mapped regions.
            If 0, a default is chosen: 8192 MB if is_64_bit() reports True, 1024 MB otherwise.
            It is a soft limit that is tried to be kept, but nothing bad happens if we have to over-allocate
        :param max_open_handles: upper limit for the amount of open file handles.
            It is stored and reported by max_file_handles(); _obtain_region() of this
            class does not check it"""
        self._fdict = dict()
        self._window_size = window_size
        self._max_memory_size = max_memory_size
        self._max_handle_count = max_open_handles
        self._memory_size = 0
        self._handle_count = 0

        if window_size < 0:
            coeff = 64
            if is_64_bit():
                coeff = 1024
            # END handle arch
            self._window_size = coeff * self._MB_in_bytes
        # END handle max window size

        if max_memory_size == 0:
            coeff = 1024
            if is_64_bit():
                coeff = 8192
            # END handle arch
            self._max_memory_size = coeff * self._MB_in_bytes
        # END handle max memory size

    #{ Internal Methods

    def _collect_lru_region(self, size):
        """Unmap the region which was least-recently used and has no client

        :param size: size of the region we want to map next (assuming its not already mapped partially or full
            if 0, we try to free any available region
        :return: Amount of freed regions

        .. Note::
            We don't raise exceptions anymore, in order to keep the system working, allowing temporary overallocation.
            If the system runs out of memory, it will tell.

        .. TODO::
            implement a case where all unused regions are discarded efficiently.
            Currently its only brute force
        """
        num_found = 0
        while (size == 0) or (self._memory_size + size > self._max_memory_size):
            lru_region = None
            lru_list = None
            for regions in self._fdict.values():
                for region in regions:
                    # check client count - if it's 1, it's just us
                    if (region.client_count() == 1 and
                            (lru_region is None or region._uc < lru_region._uc)):
                        lru_region = region
                        lru_list = regions
                    # END update lru_region
                # END for each region
            # END for each regions list

            if lru_region is None:
                # nothing left that nobody else is using
                break
            # END handle region not found

            num_found += 1
            del lru_list[lru_list.index(lru_region)]
            lru_region.increment_client_count(-1)
            self._memory_size -= lru_region.size()
            self._handle_count -= 1
        # END while there is more memory to free
        return num_found

    def _obtain_region(self, a, offset, size, flags, is_recursive):
        """Utility to create a new region - for more information on the parameters,
        see WindowCursor.use_region.

        :param a: A regions (a)rray
        :param is_recursive: True if this call is the retry after a failed attempt;
            a second failure is then propagated to the caller
        :return: The region for the file: the single existing one if there is one,
            otherwise a newly created one"""
        if self._memory_size + size > self._max_memory_size:
            self._collect_lru_region(size)
        # END handle collection

        r = None
        if a:
            # a static manager keeps exactly one region per file
            assert len(a) == 1
            r = a[0]
        else:
            try:
                r = self.MapRegionCls(a.path_or_fd(), 0, sys.maxsize, flags)
            except Exception:
                # apparently we are out of system resources or hit a limit
                # As many more operations are likely to fail in that condition (
                # like reading a file from disk, etc) we free up as much as possible
                # As this invalidates our insert position, we have to recurse here
                if is_recursive:
                    # we already tried this, and still have no success in obtaining
                    # a mapping. This is an exception, so we propagate it
                    raise
                # END handle existing recursion
                self._collect_lru_region(0)
                return self._obtain_region(a, offset, size, flags, True)
            # END handle exceptions

            self._handle_count += 1
            self._memory_size += r.size()
            a.append(r)
        # END handle array

        assert r.includes_ofs(offset)
        return r

    #}END internal methods

    #{ Interface
    def make_cursor(self, path_or_fd):
        """
        :return: a cursor pointing to the given path or file descriptor.
            It can be used to map new regions of the file into memory

        **Note:** if a file descriptor is given, it is assumed to be open and valid,
        but may be closed afterwards. To refer to the same file, you may reuse
        your existing file descriptor, but keep in mind that new windows can only
        be mapped as long as it stays valid. This is why the using actual file paths
        are preferred unless you plan to keep the file descriptor open.

        **Note:** file descriptors are problematic as they are not necessarily unique, as two
        different files opened and closed in succession might have the same file descriptor id.

        **Note:** Using file descriptors directly is faster once new windows are mapped as it
        prevents the file to be opened again just for the purpose of mapping it."""
        regions = self._fdict.get(path_or_fd)
        if regions is None:
            regions = self.MapRegionListCls(path_or_fd)
            self._fdict[path_or_fd] = regions
        # END obtain region for path
        return self.WindowCursorCls(self, regions)

    def collect(self):
        """Collect all available free-to-collect mapped regions

        :return: Amount of freed handles"""
        return self._collect_lru_region(0)

    def num_file_handles(self):
        """:return: amount of file handles in use. Each mapped region uses one file handle"""
        return self._handle_count

    def num_open_files(self):
        """:return: amount of files which currently have at least one mapped region"""
        return sum(1 for rlist in self._fdict.values() if len(rlist) > 0)

    def window_size(self):
        """:return: size of each window when allocating new regions"""
        return self._window_size

    def mapped_memory_size(self):
        """:return: amount of bytes currently mapped in total"""
        return self._memory_size

    def max_file_handles(self):
        """:return: maximum amount of handles we may have opened"""
        return self._max_handle_count

    def max_mapped_memory_size(self):
        """:return: maximum amount of memory we may allocate"""
        return self._max_memory_size

    #} END interface

    #{ Special Purpose Interface

    def force_map_handle_removal_win(self, base_path):
        """ONLY AVAILABLE ON WINDOWS
        On windows removing files is not allowed if anybody still has it opened.
        If this process is ourselves, and if the whole process uses this memory
        manager (as far as the parent framework is concerned) we can enforce
        closing all memory maps whose path matches the given base path to
        allow the respective operation after all.
        The respective system must NOT access the closed memory regions anymore !
        This really may only be used if you know that the items which keep
        the cursors alive will not be using it anymore. They need to be recreated !

        :param base_path: prefix of the paths whose regions should be released
        :return: Amount of closed handles, 0 on non-windows platforms

        **Note:** does nothing on non-windows platforms"""
        if sys.platform != 'win32':
            return 0
        # END early bailout

        num_closed = 0
        for path, rlist in self._fdict.items():
            # Files may be registered by file descriptor (an int), which can never
            # match a base path and has no startswith()
            if isinstance(path, str) and path.startswith(base_path):
                for region in rlist:
                    region.release()
                    num_closed += 1
                # END for each region
            # END path matches
        # END for each path
        return num_closed
    #} END special purpose interface
class SlidingWindowMapManager(StaticWindowMapManager):
    """Keeps, per file, a list of mapped regions ordered by offset and hands out
    additional regions which do not overlap the existing ones.

    When the configured memory limit would be exceeded, or when creating a region
    fails (which includes hitting our own handle limit), unused regions are
    collected, least recently used first.

    **Note:** currently not thread-safe !"""

    __slots__ = tuple()

    def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize):
        """Same as the base class, except that window_size defaults to -1"""
        super().__init__(window_size, max_memory_size, max_open_handles)

    def _obtain_region(self, a, offset, size, flags, is_recursive):
        """Return the region of regions list ``a`` which contains ``offset``,
        creating and inserting a new one if there is none yet.
        Parameters are the same as in the base class."""
        # Binary search over the offset-ordered list for a region containing offset
        found = None
        low, high = 0, len(a)
        while low < high:
            probe = (low + high) // 2
            if a[probe]._b > offset:
                high = probe
                continue
            # END region starts behind offset
            if a[probe].includes_ofs(offset):
                found = a[probe]
                break
            # END have region
            low = probe + 1
        # END while bisecting

        if found is not None:
            return found
        # END reuse existing region

        window_size = self._window_size
        make_window = self.MapWindowCls
        left = make_window(0, 0)
        mid = make_window(offset, size)
        right = make_window(a.file_size(), 0)

        # honor the memory limit before mapping another window
        if self._memory_size + window_size > self._max_memory_size:
            self._collect_lru_region(window_size)
        # END handle collection

        # The list is assumed to be ordered by offset: the new region goes in front
        # of the first region which starts behind offset, or to the very end
        num_regions = len(a)
        insert_pos = num_regions
        for index, existing in enumerate(a):
            if existing._b > offset:
                insert_pos = index
                break
            # END insert position found
        # END for each region

        # Neighbouring regions limit how far the new window may grow
        if insert_pos < num_regions:
            right = make_window.from_region(a[insert_pos])
        # END adjust right window
        if insert_pos > 0:
            left = make_window.from_region(a[insert_pos - 1])
        # END adjust left window

        mid.extend_left_to(left, window_size)
        mid.extend_right_to(right, window_size)
        mid.align()

        # aligning may have pushed the end beyond the right boundary
        if mid.ofs_end() > right.ofs:
            mid.size = right.ofs - mid.ofs
        # END readjust size

        try:
            if self._handle_count >= self._max_handle_count:
                # our own handle limit is treated like a failed mapping
                raise Exception
            # END assert own imposed max file handles
            found = self.MapRegionCls(a.path_or_fd(), mid.ofs, mid.size, flags)
        except Exception:
            # Creating the region failed. Free as much as possible and retry once
            # from scratch, as collecting invalidates the insert position.
            # A failure during the retry is propagated
            if is_recursive:
                raise
            # END handle existing recursion
            self._collect_lru_region(0)
            return self._obtain_region(a, offset, size, flags, True)
        # END handle exceptions

        self._handle_count += 1
        self._memory_size += found.size()
        a.insert(insert_pos, found)
        return found

View File

@@ -0,0 +1,72 @@
"""Provide base classes for the test system"""
from unittest import TestCase
import os
import tempfile
__all__ = ['TestBase', 'FileCreator']
#{ Utilities
class FileCreator:

    """An instance which creates a temporary file with a prefix and a given size
    and provides this info to the user.
    The file is removed when the instance is deleted or when a ``with`` block
    using it is left."""
    __slots__ = ("_size", "_path")

    def __init__(self, size, prefix=''):
        """Create a temporary file of exactly ``size`` bytes.

        :param size: size of the file in bytes, must not be 0
        :param prefix: prefix of the temporary file's name"""
        assert size, "Require size to be larger 0"

        # mkstemp creates the file right away and hands out an open descriptor,
        # avoiding the race between choosing a name and creating the file
        # that tempfile.mktemp has
        fd, self._path = tempfile.mkstemp(prefix=prefix)
        self._size = size

        with os.fdopen(fd, "wb") as fp:
            # writing a single byte at the last position produces the full size
            fp.seek(size - 1)
            fp.write(b'1')

        assert os.path.getsize(self.path) == size

    def _remove_file(self):
        """Remove the temporary file, ignoring that it may be gone already"""
        try:
            os.remove(self.path)
        except (OSError, AttributeError):
            # OSError: file was removed already.
            # AttributeError: __init__ failed before the path was assigned
            pass
        # END exception handling

    def __del__(self):
        self._remove_file()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._remove_file()

    @property
    def path(self):
        """Path of the temporary file"""
        return self._path

    @property
    def size(self):
        """Size of the temporary file in bytes, as given on construction"""
        return self._size
#} END utilities
class TestBase(TestCase):

    """Common base class of all test cases"""

    #{ Configuration
    # size used by the tests for their temporary files, in bytes
    k_window_test_size = 8 * 1000 * 1000 + 5195
    #} END configuration

    #{ Overrides
    @classmethod
    def setUpAll(cls):
        """Class level setup - there is nothing to do for now"""
        return None

    # END overrides

    #{ Interface

    #} END interface

View File

@@ -0,0 +1,126 @@
from .lib import TestBase, FileCreator
from smmap.mman import (
SlidingWindowMapManager,
StaticWindowMapManager
)
from smmap.buf import SlidingWindowMapBuffer
from random import randint
from time import time
import sys
import os
# Managers shared by the buffer tests below.
# Sliding manager with default settings
man_optimal = SlidingWindowMapManager()
# Sliding manager restricted to small windows, little memory and few handles
man_worst_case = SlidingWindowMapManager(
window_size=TestBase.k_window_test_size // 100,
max_memory_size=TestBase.k_window_test_size // 3,
max_open_handles=15)
# Static manager with default settings
static_man = StaticWindowMapManager()
class TestBuf(TestBase):

    """Tests for SlidingWindowMapBuffer on top of the module level managers"""

    def test_basics(self):
        """Check construction, begin/end access and indexing of the buffer, then
        run random single byte and slice accesses against every manager."""
        with FileCreator(self.k_window_test_size, "buffer_test") as fc:

            # invalid paths fail upon construction
            c = man_optimal.make_cursor(fc.path)
            self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)())            # invalid cursor
            self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size)       # offset too large

            buf = SlidingWindowMapBuffer()                                              # can create uninitialized buffers
            assert buf.cursor() is None

            # can call end access any time
            buf.end_access()
            buf.end_access()
            assert len(buf) == 0

            # begin access can revive it, if the offset is suitable
            offset = 100
            assert not buf.begin_access(c, fc.size)
            assert buf.begin_access(c, offset)
            assert len(buf) == fc.size - offset
            assert buf.cursor().is_valid()

            # empty begin access keeps it valid on the same path, but alters the offset
            assert buf.begin_access()
            assert len(buf) == fc.size
            assert buf.cursor().is_valid()

            # simple access
            with open(fc.path, 'rb') as fp:
                data = fp.read()
            assert data[offset] == buf[0]
            assert data[offset:offset * 2] == buf[0:offset]

            # negative indices, partial slices
            assert buf[-1] == buf[len(buf) - 1]
            assert buf[-10:] == buf[len(buf) - 10:len(buf)]

            # end access makes its cursor invalid
            buf.end_access()
            assert not buf.cursor().is_valid()
            assert buf.cursor().is_associated()         # but it remains associated

            # an empty begin access fixes it up again
            assert buf.begin_access() and buf.cursor().is_valid()
            del(buf)        # ends access automatically
            del(c)

            assert man_optimal.num_file_handles() == 1

            # PERFORMANCE
            # blast away with random access and a full mapping - we don't want to
            # exaggerate the manager's overhead, but measure the buffer overhead
            # We do it once with an optimal setting, and with a worse manager which
            # will produce small mappings only !
            max_num_accesses = 100
            fd = os.open(fc.path, os.O_RDONLY)
            try:
                for item in (fc.path, fd):
                    for manager, man_id in ((man_optimal, 'optimal'),
                                            (man_worst_case, 'worst case'),
                                            (static_man, 'static optimal')):
                        buf = SlidingWindowMapBuffer(manager.make_cursor(item))
                        assert manager.num_file_handles() == 1
                        for access_mode in range(2):    # single, multi
                            num_accesses_left = max_num_accesses
                            num_bytes = 0
                            fsize = fc.size

                            st = time()
                            buf.begin_access()
                            while num_accesses_left:
                                num_accesses_left -= 1
                                if access_mode:  # multi
                                    ofs_start = randint(0, fsize)
                                    ofs_end = randint(ofs_start, fsize)
                                    d = buf[ofs_start:ofs_end]
                                    assert len(d) == ofs_end - ofs_start
                                    assert d == data[ofs_start:ofs_end]
                                    num_bytes += len(d)
                                    del d
                                else:
                                    # randint includes its upper bound, the last valid
                                    # index is fsize - 1
                                    pos = randint(0, fsize - 1)
                                    assert buf[pos] == data[pos]
                                    num_bytes += 1
                                # END handle mode
                            # END handle num accesses

                            buf.end_access()
                            assert manager.num_file_handles()
                            assert manager.collect()
                            assert manager.num_file_handles() == 0
                            elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
                            mb = float(1000 * 1000)
                            mode_str = (access_mode and "slice") or "single byte"
                            print("%s: Made %i random %s accesses to buffer created from %s reading a total of %f mb in %f s (%f mb/s)"
                                  % (man_id, max_num_accesses, mode_str, type(item), num_bytes / mb, elapsed, (num_bytes / mb) / elapsed),
                                  file=sys.stderr)
                        # END handle access mode
                        del buf
                    # END for each manager
                # END for each input
            finally:
                # close the descriptor even if one of the assertions above failed
                os.close(fd)
            # END assure descriptor is closed

View File

@@ -0,0 +1,224 @@
from .lib import TestBase, FileCreator
from smmap.mman import (
WindowCursor,
SlidingWindowMapManager,
StaticWindowMapManager
)
from smmap.util import align_to_mmap
from random import randint
from time import time
import os
import sys
from copy import copy
class TestMMan(TestBase):
def test_cursor(self):
    """Check the state queries of cursors, copying/assigning them and their destruction"""
    with FileCreator(self.k_window_test_size, "cursor_test") as fc:
        man = SlidingWindowMapManager()

        # a cursor created without a regions list is neither valid nor associated
        detached = WindowCursor(man)
        assert not detached.is_valid()
        assert not detached.is_associated()
        assert detached.size() == 0       # this is cached, so we can query it in invalid state

        # a cursor made by the manager knows its file, but has no region mapped yet
        attached = man.make_cursor(fc.path)
        assert not attached.is_valid()
        assert attached.is_associated()
        assert attached.file_size() == fc.size
        assert attached.path() == fc.path

        # copy module
        clone = copy(attached)
        assert not clone.is_valid() and clone.is_associated()

        # assign method
        assert not detached.is_associated()
        detached.assign(attached)
        assert not detached.is_valid() and detached.is_associated()

        # unuse non-existing region is fine
        attached.unuse_region()
        attached.unuse_region()

        # destruction is fine (even multiple times)
        attached._destroy()
        WindowCursor(man)._destroy()
def test_memory_manager(self):
    """Check the initial state of both manager types, collection on an empty
    manager and the most basic region access through a cursor."""
    slide_man = SlidingWindowMapManager()
    static_man = StaticWindowMapManager()

    for man in (static_man, slide_man):
        assert man.num_file_handles() == 0
        assert man.num_open_files() == 0
        # The sliding manager must report a window size larger than 0, the static
        # one may report 0. Test for the sliding type, as it derives from the
        # static one and an isinstance check for the static type matches both
        winsize_cmp_val = 0
        if not isinstance(man, SlidingWindowMapManager):
            winsize_cmp_val = -1
        # END handle window size
        assert man.window_size() > winsize_cmp_val
        assert man.mapped_memory_size() == 0
        assert man.max_mapped_memory_size() > 0

        # collection doesn't raise in 'any' mode
        man._collect_lru_region(0)
        # doesn't raise if we are within the limit
        man._collect_lru_region(10)

        # doesn't fail if we over-allocate
        assert man._collect_lru_region(sys.maxsize) == 0

        # use a region, verify most basic functionality
        with FileCreator(self.k_window_test_size, "manager_test") as fc:
            fd = os.open(fc.path, os.O_RDONLY)
            try:
                for item in (fc.path, fd):
                    c = man.make_cursor(item)
                    assert c.path_or_fd() is item
                    assert c.use_region(10, 10).is_valid()
                    assert c.ofs_begin() == 10
                    assert c.size() == 10
                    with open(fc.path, 'rb') as fp:
                        assert c.buffer()[:] == fp.read(20)[10:]

                    # only the accessor matching the kind of item may succeed
                    if isinstance(item, int):
                        self.assertRaises(ValueError, c.path)
                    else:
                        self.assertRaises(ValueError, c.fd)
                    # END handle value error
                # END for each input
            finally:
                os.close(fd)
    # END for each manager type
def test_memman_operation(self):
"""Exercise both manager types with limited memory so that regions actually get unmapped.

For each manager type, and for a cursor created from a path as well as from a
file descriptor, this maps windows near the start and the end of the test file,
performs a series of random accesses while checking the memory/handle limits,
and finally verifies that all regions can be collected."""
# test more access, force it to actually unmap regions
with FileCreator(self.k_window_test_size, "manager_operation_test") as fc:
# keep the whole file content around as reference for all buffer comparisons
with open(fc.path, 'rb') as fp:
data = fp.read()
fd = os.open(fc.path, os.O_RDONLY)
try:
max_num_handles = 15
# small_size =
# args are (window_size, max_memory_size, max_open_handles); the static manager gets window size 0
for mtype, args in ((StaticWindowMapManager, (0, fc.size // 3, max_num_handles)),
(SlidingWindowMapManager, (fc.size // 100, fc.size // 3, max_num_handles)),):
for item in (fc.path, fd):
assert len(data) == fc.size
# small windows, a reasonable max memory. Not too many regions at once
man = mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2])
c = man.make_cursor(item)
# still empty (more about that is tested in test_memory_manager())
assert man.num_open_files() == 0
assert man.mapped_memory_size() == 0
base_offset = 5000
# window size is 0 for static managers, hence size will be 0. We take that into consideration
size = man.window_size() // 2
assert c.use_region(base_offset, size).is_valid()
rr = c.region()
assert rr.client_count() == 2 # the manager and the cursor
assert man.num_open_files() == 1
assert man.num_file_handles() == 1
assert man.mapped_memory_size() == rr.size()
# assert c.size() == size # the cursor may overallocate in its static version
assert c.ofs_begin() == base_offset
assert rr.ofs_begin() == 0 # it was aligned and expanded
if man.window_size():
# but isn't larger than the max window (aligned)
assert rr.size() == align_to_mmap(man.window_size(), True)
else:
assert rr.size() == fc.size
# END ignore static managers which dont use windows and are aligned to file boundaries
# 'size or c.size()' falls back to the cursor size when the static manager yields size == 0
assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())]
# obtain second window, which spans the first part of the file - it is still the same window
nsize = (size or fc.size) - 10
assert c.use_region(0, nsize).is_valid()
assert c.region() == rr
assert man.num_file_handles() == 1
assert c.size() == nsize
assert c.ofs_begin() == 0
assert c.buffer()[:] == data[:nsize]
# map some part at the end, our requested size cannot be kept
overshoot = 4000
base_offset = fc.size - (size or c.size()) + overshoot
assert c.use_region(base_offset, size).is_valid()
if man.window_size():
assert man.num_file_handles() == 2
assert c.size() < size
assert c.region() is not rr # old region is still available, but has no cursor ref anymore
assert rr.client_count() == 1 # only held by manager
else:
assert c.size() < fc.size
# END ignore static managers which only have one handle per file
rr = c.region()
assert rr.client_count() == 2 # manager + cursor
assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left
assert rr.ofs_end() <= fc.size # it cannot be larger than the file
assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())]
# unusing a region makes the cursor invalid
c.unuse_region()
assert not c.is_valid()
if man.window_size():
# but doesn't change anything regarding the handle count - we cache it and only
# remove mapped regions if we have to
assert man.num_file_handles() == 2
# END ignore this for static managers
# iterate through the windows, verify data contents
# this will trigger map collection after a while
max_random_accesses = 5000
num_random_accesses = max_random_accesses
memory_read = 0
st = time()
# cache everything to get some more performance
includes_ofs = c.includes_ofs
max_mapped_memory_size = man.max_mapped_memory_size()
max_file_handles = man.max_file_handles()
mapped_memory_size = man.mapped_memory_size
num_file_handles = man.num_file_handles
while num_random_accesses:
num_random_accesses -= 1
base_offset = randint(0, fc.size - 1)
# precondition
if man.window_size():
assert max_mapped_memory_size >= mapped_memory_size()
# END statics will overshoot, which is fine
assert max_file_handles >= num_file_handles()
assert c.use_region(base_offset, (size or c.size())).is_valid()
csize = c.size()
assert c.buffer()[:] == data[base_offset:base_offset + csize]
memory_read += csize
# the cursor must cover exactly [base_offset, base_offset + csize)
assert includes_ofs(base_offset)
assert includes_ofs(base_offset + csize - 1)
assert not includes_ofs(base_offset + csize)
# END while we should do an access
elapsed = max(time() - st, 0.001) # prevent zero division errors on windows
mb = float(1000 * 1000)
print("%s: Read %i mb of memory with %i random on cursor initialized with %s accesses in %fs (%f mb/s)\n"
% (mtype, memory_read / mb, max_random_accesses, type(item), elapsed, (memory_read / mb) / elapsed),
file=sys.stderr)
# an offset as large as the size doesn't work !
assert not c.use_region(fc.size, size).is_valid()
# collection - it should be able to collect all
assert man.num_file_handles()
assert man.collect()
assert man.num_file_handles() == 0
# END for each item
# END for each manager type
finally:
# the descriptor was opened by this test, so it has to be closed here
os.close(fd)

View File

@@ -0,0 +1,75 @@
from .lib import TestBase
class TestTutorial(TestBase):
    """Executable walk-through of the public smmap API: managers, cursors and buffers."""

    def test_example(self):
        """Run the tutorial snippets in order against a temporary test file."""
        # --- Memory managers -------------------------------------------------
        import smmap
        # A single manager instance is meant to be shared by the whole application;
        # its defaults are intended to suit 32-bit as well as 64-bit processes.
        manager = smmap.SlidingWindowMapManager()

        # The manager reports its current state, e.g. how many file handles are
        # open and how much memory is mapped. Nothing is in use yet.
        assert manager.num_file_handles() == 0
        assert manager.mapped_memory_size() == 0

        # --- Cursors -----------------------------------------------------------
        import smmap.test.lib
        with smmap.test.lib.FileCreator(1024 * 1024 * 8, "test_file") as tmp_file:
            # A cursor gives access to one file.
            cursor = manager.make_cursor(tmp_file.path)

            # Right after creation it knows its file but has no window mapped.
            assert cursor.is_associated()
            assert not cursor.is_valid()

            # Request a window before touching any data. Without arguments this asks
            # for as much data as possible starting at offset 0. use_region returns
            # the cursor itself, so validity can be checked in the same expression.
            assert cursor.use_region().is_valid()

            # Always consult the mapped size before indexing, as the buffer must
            # not be accessed outside of its bounds.
            assert cursor.size()
            cursor.buffer()[0]                  # first byte
            cursor.buffer()[1:10]               # bytes 1 through 9
            cursor.buffer()[cursor.size() - 1]  # last byte of the window

            # Absolute offsets can be queried and tested for inclusion.
            assert cursor.ofs_begin() < cursor.ofs_end()
            assert cursor.includes_ofs(100)

            # A request that starts out of bounds leaves the cursor invalid,
            # and an invalid cursor cannot be used.
            assert not cursor.use_region(tmp_file.size, 100).is_valid()

            # Skip the first 100 bytes and map as much as possible after them.
            assert cursor.use_region(100).is_valid()

            # Resources can be given back explicitly by unusing the region.
            cursor.unuse_region()
            assert not cursor.is_valid()

            # --- Buffers -------------------------------------------------------
            # A default buffer operates on the whole file ...
            whole_file = smmap.SlidingWindowMapBuffer(manager.make_cursor(tmp_file.path))

            # ... and is usable right away.
            assert whole_file.cursor().is_valid()
            whole_file[0]      # the first byte
            whole_file[-1]     # the last byte of the file
            whole_file[-10:]   # the last ten bytes

            # To keep the instance alive between accesses, bracket each access
            # with the dedicated methods.
            whole_file.end_access()
            assert not whole_file.cursor().is_valid()  # unusable until access begins again
            assert whole_file.begin_access(offset=10)  # resume at an offset

View File

@@ -0,0 +1,105 @@
from .lib import TestBase, FileCreator
from smmap.util import (
MapWindow,
MapRegion,
MapRegionList,
ALLOCATIONGRANULARITY,
is_64_bit,
align_to_mmap
)
import os
import sys
class TestMMan(TestBase):
    """Tests for the helper types and functions of smmap.util."""

    def test_window(self):
        """Windows snap towards their neighbours while honouring the size limit."""
        left = MapWindow(0, 1)
        center = MapWindow(1, 1)
        center2 = MapWindow(10, 5)      # a second window further right
        right = MapWindow(8000, 50)

        assert left.ofs_end() == 1
        assert center.ofs_end() == 2
        assert right.ofs_end() == 8050

        # extending is a no-op for windows which already touch
        limit = 100
        center.extend_left_to(left, limit)
        assert (center.ofs, center.size) == (1, 1)
        left.extend_right_to(center, limit)
        left.extend_right_to(center, limit)
        assert (left.ofs, left.size) == (0, 1)

        # a real extension to the left keeps the end where it was
        previous_end = center2.ofs_end()
        center2.extend_left_to(center, limit)
        assert center2.ofs == center.ofs_end()
        assert previous_end == center2.ofs_end()

        # the limit is respected, no matter how often we try
        center.extend_right_to(right, limit)
        assert (center.ofs, center.size) == (1, limit)
        center.extend_right_to(right, limit)
        assert (center.ofs, center.size) == (1, limit)

        # effectively unlimited: grow right up to the neighbour
        center.extend_right_to(right, sys.maxsize)
        assert center.ofs_end() == right.ofs
        assert center.ofs == 1

        # same game towards the left
        right.extend_left_to(center2, limit)
        right.extend_left_to(center2, limit)
        assert right.size == limit
        right.extend_left_to(center2, sys.maxsize)
        assert right.ofs == center2.ofs_end()

        # alignment moves the start down and rounds the size up
        center.align()
        assert center.ofs == 0
        assert center.size == align_to_mmap(center.size, True)

    def test_region(self):
        """Regions report offsets and sizes correctly and do not count clients implicitly."""
        with FileCreator(self.k_window_test_size, "window_test") as fc:
            half_size = fc.size // 2
            aligned_ofs = align_to_mmap(4200, False)
            whole = MapRegion(fc.path, 0, fc.size)
            from_offset = MapRegion(fc.path, aligned_ofs, fc.size)
            first_half = MapRegion(fc.path, 0, half_size)

            # offsets and sizes
            assert whole.ofs_begin() == 0
            assert whole.size() == fc.size
            assert whole.ofs_end() == fc.size  # if this method works, it works always
            assert from_offset.ofs_begin() == aligned_ofs
            assert from_offset.size() == fc.size - aligned_ofs
            assert first_half.ofs_begin() == 0
            assert first_half.size() == half_size

            for inside in (0, fc.size - 1, half_size):
                assert whole.includes_ofs(inside)
            for outside in (-1, sys.maxsize):
                assert not whole.includes_ofs(outside)

            # binding another name to the region must not change the client count
            assert whole.client_count() == 1
            alias = whole
            assert whole.client_count() == 1, "no auto-counting"

            # a window built from a region covers exactly that region
            window = MapWindow.from_region(whole)
            assert window.ofs == whole.ofs_begin()
            assert window.ofs_end() == whole.ofs_end()

    def test_region_list(self):
        """A region list starts empty and knows its file, for paths and descriptors alike."""
        with FileCreator(100, "sample_file") as fc:
            fd = os.open(fc.path, os.O_RDONLY)
            try:
                for source in (fc.path, fd):
                    regions = MapRegionList(source)
                    assert len(regions) == 0
                    assert regions.path_or_fd() == source
                    assert regions.file_size() == fc.size
            finally:
                os.close(fd)

    def test_util(self):
        """Smoke test for the free functions."""
        assert isinstance(is_64_bit(), bool)  # just call it
        assert align_to_mmap(1, False) == 0
        assert align_to_mmap(1, True) == ALLOCATIONGRANULARITY

View File

@@ -0,0 +1,222 @@
"""Module containing a memory memory manager which provides a sliding window on a number of memory mapped files"""
import os
import sys
from mmap import mmap, ACCESS_READ
from mmap import ALLOCATIONGRANULARITY
__all__ = ["align_to_mmap", "is_64_bit",
"MapWindow", "MapRegion", "MapRegionList", "ALLOCATIONGRANULARITY"]
#{ Utilities
def align_to_mmap(num, round_up):
    """
    Snap the given integer to a multiple of the mmap allocation granularity.

    :param num: the number (typically a byte offset or size) to align
    :param round_up: if true, a number which is not already aligned is moved to the
        next higher multiple, otherwise to the next lower one (i.e. 1 becomes
        ALLOCATIONGRANULARITY when rounding up, and 0 otherwise)
    :return: the aligned number; already aligned input is returned unchanged"""
    whole_units, remainder = divmod(num, ALLOCATIONGRANULARITY)
    if round_up and remainder:
        # a partially used unit counts as a full one when rounding up
        whole_units += 1
    # END handle round up
    return whole_units * ALLOCATIONGRANULARITY
def is_64_bit():
    """:return: True if ``sys.maxsize`` does not fit into 32 bits, which is taken to mean
        a 64 bit system. Otherwise it can be assumed to be 32 bit"""
    largest_32_bit_value = 0xFFFFFFFF
    return sys.maxsize > largest_32_bit_value
#}END utilities
#{ Utility Classes
class MapWindow:
    """Helper describing a byte range of a file as (offset, size).

    Windows can be snapped towards neighbouring windows and aligned, which adjusts
    their offset and size in place."""
    __slots__ = (
        'ofs',      # offset into the file in bytes
        'size'      # size of the window in bytes
    )

    def __init__(self, offset, size):
        """Create a window starting at ``offset`` which spans ``size`` bytes"""
        self.ofs = offset
        self.size = size

    def __repr__(self):
        return "MapWindow(%i, %i)" % (self.ofs, self.size)

    @classmethod
    def from_region(cls, region):
        """:return: new window covering the same range as the given region"""
        return cls(region._b, region.size())

    def ofs_end(self):
        """:return: offset to the first byte after this window"""
        return self.ofs + self.size

    def align(self):
        """Align the start downwards and round the size upwards, which assures the
        previous window area is contained in the new one"""
        aligned_start = align_to_mmap(self.ofs, False)
        # moving the start to the left must not move the end, hence the size grows
        grown_size = self.size + (self.ofs - aligned_start)
        self.ofs = aligned_start
        self.size = align_to_mmap(grown_size, True)

    def extend_left_to(self, window, max_size):
        """Move our start towards the end of ``window``, which is located on our left,
        without letting our size exceed ``max_size``.

        If ``max_size`` is smaller than our current size, the start moves to the right
        instead, so that the size ends up being ``max_size``. Our end never moves.

        :param window: the window on our left
        :param max_size: upper limit for our resulting size"""
        gap = self.ofs - window.ofs_end()
        wanted_size = gap + self.size
        # the part of the wanted size that the limit does not allow
        overflow = wanted_size - min(wanted_size, max_size)
        shift = gap - overflow
        self.ofs -= shift
        self.size += shift

    def extend_right_to(self, window, max_size):
        """Grow our size so that we end where ``window``, located on our right, begins,
        capped to ``max_size``

        :param window: the window on our right
        :param max_size: upper limit for our resulting size"""
        gap = window.ofs - self.ofs_end()
        self.size = min(self.size + gap, max_size)
class MapRegion:
    """Defines a read-only mapped region of a file, together with a client counter.

    **Note:** the memory map is closed automatically once the client count drops
    to zero, see ``increment_client_count``"""
    __slots__ = [
        '_b',   # beginning of mapping
        '_mf',  # mapped memory chunk (as returned by mmap)
        '_uc',  # total amount of usages
        '_size',  # cached size of our memory map
        '__weakref__'
    ]

    #{ Configuration
    #} END configuration

    def __init__(self, path_or_fd, ofs, size, flags=0):
        """Initialize a region, allocate the memory map

        :param path_or_fd: path to the file to map, or the opened file descriptor.
            Everything which is not an int is treated as a path and opened (and closed
            again) by us. A descriptor passed in by the caller is never closed here.
        :param ofs: **aligned** offset into the file to be mapped
        :param size: if size is larger than the remainder of the file on disk, only that
            remainder will be mapped and the size is adjusted automatically
        :param flags: additional flags to be given when opening the file.
        :raise Exception: if no memory can be allocated"""
        self._b = ofs
        self._size = 0
        self._uc = 0

        # Remember whether the descriptor is ours. The same condition decides about
        # closing it below, so that no kind of path (str, bytes, os.PathLike) leaks it.
        opened_here = not isinstance(path_or_fd, int)
        if opened_here:
            fd = os.open(path_or_fd, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags)
        else:
            fd = path_or_fd
        # END handle fd

        try:
            # mmap refuses lengths which reach beyond the end of the file, so the
            # requested size is clamped to what the file provides after the offset
            actual_size = min(os.fstat(fd).st_size - ofs, size)
            self._mf = mmap(fd, actual_size, access=ACCESS_READ, offset=ofs)
            self._size = len(self._mf)
        finally:
            if opened_here:
                os.close(fd)
            # END only close it if we opened it
        # END close file handle

        # We assume the first one to use us keeps us around
        self.increment_client_count()

    def __repr__(self):
        return "MapRegion<%i, %i>" % (self._b, self.size())

    #{ Interface

    def buffer(self):
        """:return: a buffer containing the memory"""
        return self._mf

    def map(self):
        """:return: a memory map containing the memory"""
        return self._mf

    def ofs_begin(self):
        """:return: absolute byte offset to the first byte of the mapping"""
        return self._b

    def size(self):
        """:return: total size of the mapped region in bytes"""
        return self._size

    def ofs_end(self):
        """:return: Absolute offset to one byte beyond the mapping into the file"""
        return self._b + self._size

    def includes_ofs(self, ofs):
        """:return: True if the given offset can be read in our mapped region"""
        return self._b <= ofs < self._b + self._size

    def client_count(self):
        """:return: number of clients currently using this region"""
        return self._uc

    def increment_client_count(self, ofs=1):
        """Adjust the usage count by the given positive or negative offset.
        If usage count equals 0, we will auto-release our resources

        :param ofs: amount to add to the usage count, may be negative
        :return: True if we released resources, False otherwise. In the latter case, we can still be used"""
        self._uc += ofs
        assert self._uc > -1, "Increments must match decrements, usage counter negative: %i" % self._uc

        if self.client_count() == 0:
            self.release()
            return True
        # end handle release
        return False

    def release(self):
        """Release all resources this instance might hold. Must only be called if the
        client_count() is zero"""
        self._mf.close()

    #} END interface
class MapRegionList(list):
    """List of MapRegion instances associating a path or file descriptor with a list of regions."""
    __slots__ = (
        '_path_or_fd',  # path or file descriptor which is mapped by all our regions
        '_file_size'    # total size of the file we map, None until first queried
    )

    def __new__(cls, path):
        # the argument is consumed by __init__, the list itself starts out empty
        return super().__new__(cls)

    def __init__(self, path_or_fd):
        """Initialize an empty list for the given file

        :param path_or_fd: path to the file, or an opened file descriptor (int)"""
        self._path_or_fd = path_or_fd
        self._file_size = None

    def path_or_fd(self):
        """:return: path or file descriptor we are attached to"""
        return self._path_or_fd

    def file_size(self):
        """:return: size of the file we manage in bytes. It is queried from the file system
            on first use and cached afterwards"""
        if self._file_size is None:
            # Only ints are descriptors. Everything else is a path, which includes
            # bytes and os.PathLike objects that os.fstat would reject.
            if isinstance(self._path_or_fd, int):
                self._file_size = os.fstat(self._path_or_fd).st_size
            else:
                self._file_size = os.stat(self._path_or_fd).st_size
            # END handle path type
        # END update file size
        return self._file_size
#} END utility classes