mirror of
https://github.com/aykhans/AzSuicideDataVisualization.git
synced 2025-07-04 07:08:05 +00:00
first commit
This commit is contained in:
1
.venv/Lib/site-packages/pandas/io/sas/__init__.py
Normal file
1
.venv/Lib/site-packages/pandas/io/sas/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from pandas.io.sas.sasreader import read_sas # noqa:F401
|
BIN
.venv/Lib/site-packages/pandas/io/sas/_sas.cp310-win_amd64.pyd
Normal file
BIN
.venv/Lib/site-packages/pandas/io/sas/_sas.cp310-win_amd64.pyd
Normal file
Binary file not shown.
439
.venv/Lib/site-packages/pandas/io/sas/sas.pyx
Normal file
439
.venv/Lib/site-packages/pandas/io/sas/sas.pyx
Normal file
@ -0,0 +1,439 @@
|
||||
# cython: profile=False
|
||||
# cython: boundscheck=False, initializedcheck=False
|
||||
from cython import Py_ssize_t
|
||||
import numpy as np
|
||||
|
||||
import pandas.io.sas.sas_constants as const
|
||||
|
||||
ctypedef signed long long int64_t
|
||||
ctypedef unsigned char uint8_t
|
||||
ctypedef unsigned short uint16_t
|
||||
|
||||
# rle_decompress decompresses data using a Run Length Encoding
|
||||
# algorithm. It is partially documented here:
|
||||
#
|
||||
# https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
|
||||
cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff):
|
||||
|
||||
cdef:
|
||||
uint8_t control_byte, x
|
||||
uint8_t[:] result = np.zeros(result_length, np.uint8)
|
||||
int rpos = 0
|
||||
int i, nbytes, end_of_first_byte
|
||||
Py_ssize_t ipos = 0, length = len(inbuff)
|
||||
|
||||
while ipos < length:
|
||||
control_byte = inbuff[ipos] & 0xF0
|
||||
end_of_first_byte = <int>(inbuff[ipos] & 0x0F)
|
||||
ipos += 1
|
||||
|
||||
if control_byte == 0x00:
|
||||
if end_of_first_byte != 0:
|
||||
raise ValueError("Unexpected non-zero end_of_first_byte")
|
||||
nbytes = <int>(inbuff[ipos]) + 64
|
||||
ipos += 1
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = inbuff[ipos]
|
||||
rpos += 1
|
||||
ipos += 1
|
||||
elif control_byte == 0x40:
|
||||
# not documented
|
||||
nbytes = end_of_first_byte * 16
|
||||
nbytes += <int>(inbuff[ipos])
|
||||
ipos += 1
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = inbuff[ipos]
|
||||
rpos += 1
|
||||
ipos += 1
|
||||
elif control_byte == 0x60:
|
||||
nbytes = end_of_first_byte * 256 + <int>(inbuff[ipos]) + 17
|
||||
ipos += 1
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = 0x20
|
||||
rpos += 1
|
||||
elif control_byte == 0x70:
|
||||
nbytes = end_of_first_byte * 256 + <int>(inbuff[ipos]) + 17
|
||||
ipos += 1
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = 0x00
|
||||
rpos += 1
|
||||
elif control_byte == 0x80:
|
||||
nbytes = end_of_first_byte + 1
|
||||
for i in range(nbytes):
|
||||
result[rpos] = inbuff[ipos + i]
|
||||
rpos += 1
|
||||
ipos += nbytes
|
||||
elif control_byte == 0x90:
|
||||
nbytes = end_of_first_byte + 17
|
||||
for i in range(nbytes):
|
||||
result[rpos] = inbuff[ipos + i]
|
||||
rpos += 1
|
||||
ipos += nbytes
|
||||
elif control_byte == 0xA0:
|
||||
nbytes = end_of_first_byte + 33
|
||||
for i in range(nbytes):
|
||||
result[rpos] = inbuff[ipos + i]
|
||||
rpos += 1
|
||||
ipos += nbytes
|
||||
elif control_byte == 0xB0:
|
||||
nbytes = end_of_first_byte + 49
|
||||
for i in range(nbytes):
|
||||
result[rpos] = inbuff[ipos + i]
|
||||
rpos += 1
|
||||
ipos += nbytes
|
||||
elif control_byte == 0xC0:
|
||||
nbytes = end_of_first_byte + 3
|
||||
x = inbuff[ipos]
|
||||
ipos += 1
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = x
|
||||
rpos += 1
|
||||
elif control_byte == 0xD0:
|
||||
nbytes = end_of_first_byte + 2
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = 0x40
|
||||
rpos += 1
|
||||
elif control_byte == 0xE0:
|
||||
nbytes = end_of_first_byte + 2
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = 0x20
|
||||
rpos += 1
|
||||
elif control_byte == 0xF0:
|
||||
nbytes = end_of_first_byte + 2
|
||||
for _ in range(nbytes):
|
||||
result[rpos] = 0x00
|
||||
rpos += 1
|
||||
else:
|
||||
raise ValueError(f"unknown control byte: {control_byte}")
|
||||
|
||||
# In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t
|
||||
if <Py_ssize_t>len(result) != <Py_ssize_t>result_length:
|
||||
raise ValueError(f"RLE: {len(result)} != {result_length}")
|
||||
|
||||
return np.asarray(result)
|
||||
|
||||
|
||||
# rdc_decompress decompresses data using the Ross Data Compression algorithm:
|
||||
#
|
||||
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
|
||||
cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff):
|
||||
|
||||
cdef:
|
||||
uint8_t cmd
|
||||
uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt
|
||||
int rpos = 0, k
|
||||
uint8_t[:] outbuff = np.zeros(result_length, dtype=np.uint8)
|
||||
Py_ssize_t ipos = 0, length = len(inbuff)
|
||||
|
||||
ii = -1
|
||||
|
||||
while ipos < length:
|
||||
ii += 1
|
||||
ctrl_mask = ctrl_mask >> 1
|
||||
if ctrl_mask == 0:
|
||||
ctrl_bits = ((<uint16_t>inbuff[ipos] << 8) +
|
||||
<uint16_t>inbuff[ipos + 1])
|
||||
ipos += 2
|
||||
ctrl_mask = 0x8000
|
||||
|
||||
if ctrl_bits & ctrl_mask == 0:
|
||||
outbuff[rpos] = inbuff[ipos]
|
||||
ipos += 1
|
||||
rpos += 1
|
||||
continue
|
||||
|
||||
cmd = (inbuff[ipos] >> 4) & 0x0F
|
||||
cnt = <uint16_t>(inbuff[ipos] & 0x0F)
|
||||
ipos += 1
|
||||
|
||||
# short RLE
|
||||
if cmd == 0:
|
||||
cnt += 3
|
||||
for k in range(cnt):
|
||||
outbuff[rpos + k] = inbuff[ipos]
|
||||
rpos += cnt
|
||||
ipos += 1
|
||||
|
||||
# long RLE
|
||||
elif cmd == 1:
|
||||
cnt += <uint16_t>inbuff[ipos] << 4
|
||||
cnt += 19
|
||||
ipos += 1
|
||||
for k in range(cnt):
|
||||
outbuff[rpos + k] = inbuff[ipos]
|
||||
rpos += cnt
|
||||
ipos += 1
|
||||
|
||||
# long pattern
|
||||
elif cmd == 2:
|
||||
ofs = cnt + 3
|
||||
ofs += <uint16_t>inbuff[ipos] << 4
|
||||
ipos += 1
|
||||
cnt = <uint16_t>inbuff[ipos]
|
||||
ipos += 1
|
||||
cnt += 16
|
||||
for k in range(cnt):
|
||||
outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
|
||||
rpos += cnt
|
||||
|
||||
# short pattern
|
||||
elif (cmd >= 3) & (cmd <= 15):
|
||||
ofs = cnt + 3
|
||||
ofs += <uint16_t>inbuff[ipos] << 4
|
||||
ipos += 1
|
||||
for k in range(cmd):
|
||||
outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
|
||||
rpos += cmd
|
||||
|
||||
else:
|
||||
raise ValueError("unknown RDC command")
|
||||
|
||||
# In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t
|
||||
if <Py_ssize_t>len(outbuff) != <Py_ssize_t>result_length:
|
||||
raise ValueError(f"RDC: {len(outbuff)} != {result_length}\n")
|
||||
|
||||
return np.asarray(outbuff)
|
||||
|
||||
|
||||
cdef enum ColumnTypes:
|
||||
column_type_decimal = 1
|
||||
column_type_string = 2
|
||||
|
||||
|
||||
# type the page_data types
|
||||
cdef:
|
||||
int page_meta_type = const.page_meta_type
|
||||
int page_mix_types_0 = const.page_mix_types[0]
|
||||
int page_mix_types_1 = const.page_mix_types[1]
|
||||
int page_data_type = const.page_data_type
|
||||
int subheader_pointers_offset = const.subheader_pointers_offset
|
||||
|
||||
|
||||
cdef class Parser:
|
||||
|
||||
cdef:
|
||||
int column_count
|
||||
int64_t[:] lengths
|
||||
int64_t[:] offsets
|
||||
int64_t[:] column_types
|
||||
uint8_t[:, :] byte_chunk
|
||||
object[:, :] string_chunk
|
||||
char *cached_page
|
||||
int current_row_on_page_index
|
||||
int current_page_block_count
|
||||
int current_page_data_subheader_pointers_len
|
||||
int current_page_subheaders_count
|
||||
int current_row_in_chunk_index
|
||||
int current_row_in_file_index
|
||||
int header_length
|
||||
int row_length
|
||||
int bit_offset
|
||||
int subheader_pointer_length
|
||||
int current_page_type
|
||||
bint is_little_endian
|
||||
const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff)
|
||||
object parser
|
||||
|
||||
def __init__(self, object parser):
|
||||
cdef:
|
||||
int j
|
||||
char[:] column_types
|
||||
|
||||
self.parser = parser
|
||||
self.header_length = self.parser.header_length
|
||||
self.column_count = parser.column_count
|
||||
self.lengths = parser.column_data_lengths()
|
||||
self.offsets = parser.column_data_offsets()
|
||||
self.byte_chunk = parser._byte_chunk
|
||||
self.string_chunk = parser._string_chunk
|
||||
self.row_length = parser.row_length
|
||||
self.bit_offset = self.parser._page_bit_offset
|
||||
self.subheader_pointer_length = self.parser._subheader_pointer_length
|
||||
self.is_little_endian = parser.byte_order == "<"
|
||||
self.column_types = np.empty(self.column_count, dtype='int64')
|
||||
|
||||
# page indicators
|
||||
self.update_next_page()
|
||||
|
||||
column_types = parser.column_types()
|
||||
|
||||
# map column types
|
||||
for j in range(self.column_count):
|
||||
if column_types[j] == b'd':
|
||||
self.column_types[j] = column_type_decimal
|
||||
elif column_types[j] == b's':
|
||||
self.column_types[j] = column_type_string
|
||||
else:
|
||||
raise ValueError(f"unknown column type: {self.parser.columns[j].ctype}")
|
||||
|
||||
# compression
|
||||
if parser.compression == const.rle_compression:
|
||||
self.decompress = rle_decompress
|
||||
elif parser.compression == const.rdc_compression:
|
||||
self.decompress = rdc_decompress
|
||||
else:
|
||||
self.decompress = NULL
|
||||
|
||||
# update to current state of the parser
|
||||
self.current_row_in_chunk_index = parser._current_row_in_chunk_index
|
||||
self.current_row_in_file_index = parser._current_row_in_file_index
|
||||
self.current_row_on_page_index = parser._current_row_on_page_index
|
||||
|
||||
def read(self, int nrows):
|
||||
cdef:
|
||||
bint done
|
||||
int i
|
||||
|
||||
for _ in range(nrows):
|
||||
done = self.readline()
|
||||
if done:
|
||||
break
|
||||
|
||||
# update the parser
|
||||
self.parser._current_row_on_page_index = self.current_row_on_page_index
|
||||
self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index
|
||||
self.parser._current_row_in_file_index = self.current_row_in_file_index
|
||||
|
||||
cdef bint read_next_page(self):
|
||||
cdef done
|
||||
|
||||
done = self.parser._read_next_page()
|
||||
if done:
|
||||
self.cached_page = NULL
|
||||
else:
|
||||
self.update_next_page()
|
||||
return done
|
||||
|
||||
cdef update_next_page(self):
|
||||
# update data for the current page
|
||||
|
||||
self.cached_page = <char *>self.parser._cached_page
|
||||
self.current_row_on_page_index = 0
|
||||
self.current_page_type = self.parser._current_page_type
|
||||
self.current_page_block_count = self.parser._current_page_block_count
|
||||
self.current_page_data_subheader_pointers_len = len(
|
||||
self.parser._current_page_data_subheader_pointers
|
||||
)
|
||||
self.current_page_subheaders_count = self.parser._current_page_subheaders_count
|
||||
|
||||
cdef readline(self):
|
||||
|
||||
cdef:
|
||||
int offset, bit_offset, align_correction
|
||||
int subheader_pointer_length, mn
|
||||
bint done, flag
|
||||
|
||||
bit_offset = self.bit_offset
|
||||
subheader_pointer_length = self.subheader_pointer_length
|
||||
|
||||
# If there is no page, go to the end of the header and read a page.
|
||||
if self.cached_page == NULL:
|
||||
self.parser._path_or_buf.seek(self.header_length)
|
||||
done = self.read_next_page()
|
||||
if done:
|
||||
return True
|
||||
|
||||
# Loop until a data row is read
|
||||
while True:
|
||||
if self.current_page_type == page_meta_type:
|
||||
flag = self.current_row_on_page_index >=\
|
||||
self.current_page_data_subheader_pointers_len
|
||||
if flag:
|
||||
done = self.read_next_page()
|
||||
if done:
|
||||
return True
|
||||
continue
|
||||
current_subheader_pointer = (
|
||||
self.parser._current_page_data_subheader_pointers[
|
||||
self.current_row_on_page_index])
|
||||
self.process_byte_array_with_data(
|
||||
current_subheader_pointer.offset,
|
||||
current_subheader_pointer.length)
|
||||
return False
|
||||
elif (self.current_page_type == page_mix_types_0 or
|
||||
self.current_page_type == page_mix_types_1):
|
||||
align_correction = (
|
||||
bit_offset
|
||||
+ subheader_pointers_offset
|
||||
+ self.current_page_subheaders_count * subheader_pointer_length
|
||||
)
|
||||
align_correction = align_correction % 8
|
||||
offset = bit_offset + align_correction
|
||||
offset += subheader_pointers_offset
|
||||
offset += self.current_page_subheaders_count * subheader_pointer_length
|
||||
offset += self.current_row_on_page_index * self.row_length
|
||||
self.process_byte_array_with_data(offset, self.row_length)
|
||||
mn = min(self.parser.row_count, self.parser._mix_page_row_count)
|
||||
if self.current_row_on_page_index == mn:
|
||||
done = self.read_next_page()
|
||||
if done:
|
||||
return True
|
||||
return False
|
||||
elif self.current_page_type & page_data_type == page_data_type:
|
||||
self.process_byte_array_with_data(
|
||||
bit_offset
|
||||
+ subheader_pointers_offset
|
||||
+ self.current_row_on_page_index * self.row_length,
|
||||
self.row_length,
|
||||
)
|
||||
flag = self.current_row_on_page_index == self.current_page_block_count
|
||||
if flag:
|
||||
done = self.read_next_page()
|
||||
if done:
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
raise ValueError(f"unknown page type: {self.current_page_type}")
|
||||
|
||||
cdef void process_byte_array_with_data(self, int offset, int length):
|
||||
|
||||
cdef:
|
||||
Py_ssize_t j
|
||||
int s, k, m, jb, js, current_row
|
||||
int64_t lngt, start, ct
|
||||
const uint8_t[:] source
|
||||
int64_t[:] column_types
|
||||
int64_t[:] lengths
|
||||
int64_t[:] offsets
|
||||
uint8_t[:, :] byte_chunk
|
||||
object[:, :] string_chunk
|
||||
|
||||
source = np.frombuffer(
|
||||
self.cached_page[offset:offset + length], dtype=np.uint8)
|
||||
|
||||
if self.decompress != NULL and (length < self.row_length):
|
||||
source = self.decompress(self.row_length, source)
|
||||
|
||||
current_row = self.current_row_in_chunk_index
|
||||
column_types = self.column_types
|
||||
lengths = self.lengths
|
||||
offsets = self.offsets
|
||||
byte_chunk = self.byte_chunk
|
||||
string_chunk = self.string_chunk
|
||||
s = 8 * self.current_row_in_chunk_index
|
||||
js = 0
|
||||
jb = 0
|
||||
for j in range(self.column_count):
|
||||
lngt = lengths[j]
|
||||
if lngt == 0:
|
||||
break
|
||||
start = offsets[j]
|
||||
ct = column_types[j]
|
||||
if ct == column_type_decimal:
|
||||
# decimal
|
||||
if self.is_little_endian:
|
||||
m = s + 8 - lngt
|
||||
else:
|
||||
m = s
|
||||
for k in range(lngt):
|
||||
byte_chunk[jb, m + k] = source[start + k]
|
||||
jb += 1
|
||||
elif column_types[j] == column_type_string:
|
||||
# string
|
||||
string_chunk[js, current_row] = np.array(source[start:(
|
||||
start + lngt)]).tobytes().rstrip(b"\x00 ")
|
||||
js += 1
|
||||
|
||||
self.current_row_on_page_index += 1
|
||||
self.current_row_in_chunk_index += 1
|
||||
self.current_row_in_file_index += 1
|
824
.venv/Lib/site-packages/pandas/io/sas/sas7bdat.py
Normal file
824
.venv/Lib/site-packages/pandas/io/sas/sas7bdat.py
Normal file
@ -0,0 +1,824 @@
|
||||
"""
|
||||
Read SAS7BDAT files
|
||||
|
||||
Based on code written by Jared Hobbs:
|
||||
https://bitbucket.org/jaredhobbs/sas7bdat
|
||||
|
||||
See also:
|
||||
https://github.com/BioStatMatt/sas7bdat
|
||||
|
||||
Partial documentation of the file format:
|
||||
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
|
||||
|
||||
Reference for binary data compression:
|
||||
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
import struct
|
||||
from typing import cast
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
from pandas.errors import (
|
||||
EmptyDataError,
|
||||
OutOfBoundsDatetime,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
isna,
|
||||
)
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.sas._sas import Parser
|
||||
import pandas.io.sas.sas_constants as const
|
||||
from pandas.io.sas.sasreader import ReaderBase
|
||||
|
||||
|
||||
def _parse_datetime(sas_datetime: float, unit: str):
|
||||
if isna(sas_datetime):
|
||||
return pd.NaT
|
||||
|
||||
if unit == "s":
|
||||
return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
|
||||
|
||||
elif unit == "d":
|
||||
return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
|
||||
|
||||
else:
|
||||
raise ValueError("unit must be 'd' or 's'")
|
||||
|
||||
|
||||
def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
|
||||
"""
|
||||
Convert to Timestamp if possible, otherwise to datetime.datetime.
|
||||
SAS float64 lacks precision for more than ms resolution so the fit
|
||||
to datetime.datetime is ok.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sas_datetimes : {Series, Sequence[float]}
|
||||
Dates or datetimes in SAS
|
||||
unit : {str}
|
||||
"d" if the floats represent dates, "s" for datetimes
|
||||
|
||||
Returns
|
||||
-------
|
||||
Series
|
||||
Series of datetime64 dtype or datetime.datetime.
|
||||
"""
|
||||
try:
|
||||
return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
|
||||
except OutOfBoundsDatetime:
|
||||
s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
|
||||
s_series = cast(pd.Series, s_series)
|
||||
return s_series
|
||||
|
||||
|
||||
class _SubheaderPointer:
|
||||
offset: int
|
||||
length: int
|
||||
compression: int
|
||||
ptype: int
|
||||
|
||||
def __init__(self, offset: int, length: int, compression: int, ptype: int):
|
||||
self.offset = offset
|
||||
self.length = length
|
||||
self.compression = compression
|
||||
self.ptype = ptype
|
||||
|
||||
|
||||
class _Column:
|
||||
col_id: int
|
||||
name: str | bytes
|
||||
label: str | bytes
|
||||
format: str | bytes
|
||||
ctype: bytes
|
||||
length: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
col_id: int,
|
||||
# These can be bytes when convert_header_text is False
|
||||
name: str | bytes,
|
||||
label: str | bytes,
|
||||
format: str | bytes,
|
||||
ctype: bytes,
|
||||
length: int,
|
||||
):
|
||||
self.col_id = col_id
|
||||
self.name = name
|
||||
self.label = label
|
||||
self.format = format
|
||||
self.ctype = ctype
|
||||
self.length = length
|
||||
|
||||
|
||||
# SAS7BDAT represents a SAS data file in SAS7BDAT format.
|
||||
class SAS7BDATReader(ReaderBase, abc.Iterator):
|
||||
"""
|
||||
Read SAS files in SAS7BDAT format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_buf : path name or buffer
|
||||
Name of SAS file or file-like object pointing to SAS file
|
||||
contents.
|
||||
index : column identifier, defaults to None
|
||||
Column to use as index.
|
||||
convert_dates : bool, defaults to True
|
||||
Attempt to convert dates to Pandas datetime values. Note that
|
||||
some rarely used SAS date formats may be unsupported.
|
||||
blank_missing : bool, defaults to True
|
||||
Convert empty strings to missing values (SAS uses blanks to
|
||||
indicate missing character variables).
|
||||
chunksize : int, defaults to None
|
||||
Return SAS7BDATReader object for iterations, returns chunks
|
||||
with given number of lines.
|
||||
encoding : string, defaults to None
|
||||
String encoding.
|
||||
convert_text : bool, defaults to True
|
||||
If False, text variables are left as raw bytes.
|
||||
convert_header_text : bool, defaults to True
|
||||
If False, header text, including column names, are left as raw
|
||||
bytes.
|
||||
"""
|
||||
|
||||
_int_length: int
|
||||
_cached_page: bytes | None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_or_buf: FilePath | ReadBuffer[bytes],
|
||||
index=None,
|
||||
convert_dates=True,
|
||||
blank_missing=True,
|
||||
chunksize=None,
|
||||
encoding=None,
|
||||
convert_text=True,
|
||||
convert_header_text=True,
|
||||
):
|
||||
|
||||
self.index = index
|
||||
self.convert_dates = convert_dates
|
||||
self.blank_missing = blank_missing
|
||||
self.chunksize = chunksize
|
||||
self.encoding = encoding
|
||||
self.convert_text = convert_text
|
||||
self.convert_header_text = convert_header_text
|
||||
|
||||
self.default_encoding = "latin-1"
|
||||
self.compression = b""
|
||||
self.column_names_strings: list[str] = []
|
||||
self.column_names: list[str] = []
|
||||
self.column_formats: list[str] = []
|
||||
self.columns: list[_Column] = []
|
||||
|
||||
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
|
||||
self._cached_page = None
|
||||
self._column_data_lengths: list[int] = []
|
||||
self._column_data_offsets: list[int] = []
|
||||
self._column_types: list[bytes] = []
|
||||
|
||||
self._current_row_in_file_index = 0
|
||||
self._current_row_on_page_index = 0
|
||||
self._current_row_in_file_index = 0
|
||||
|
||||
self.handles = get_handle(path_or_buf, "rb", is_text=False)
|
||||
|
||||
self._path_or_buf = self.handles.handle
|
||||
|
||||
try:
|
||||
self._get_properties()
|
||||
self._parse_metadata()
|
||||
except Exception:
|
||||
self.close()
|
||||
raise
|
||||
|
||||
def column_data_lengths(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column data lengths"""
|
||||
return np.asarray(self._column_data_lengths, dtype=np.int64)
|
||||
|
||||
def column_data_offsets(self) -> np.ndarray:
|
||||
"""Return a numpy int64 array of the column offsets"""
|
||||
return np.asarray(self._column_data_offsets, dtype=np.int64)
|
||||
|
||||
def column_types(self) -> np.ndarray:
|
||||
"""
|
||||
Returns a numpy character array of the column types:
|
||||
s (string) or d (double)
|
||||
"""
|
||||
return np.asarray(self._column_types, dtype=np.dtype("S1"))
|
||||
|
||||
def close(self) -> None:
|
||||
self.handles.close()
|
||||
|
||||
def _get_properties(self) -> None:
|
||||
|
||||
# Check magic number
|
||||
self._path_or_buf.seek(0)
|
||||
self._cached_page = self._path_or_buf.read(288)
|
||||
if self._cached_page[0 : len(const.magic)] != const.magic:
|
||||
raise ValueError("magic number mismatch (not a SAS file?)")
|
||||
|
||||
# Get alignment information
|
||||
align1, align2 = 0, 0
|
||||
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
|
||||
if buf == const.u64_byte_checker_value:
|
||||
align2 = const.align_2_value
|
||||
self.U64 = True
|
||||
self._int_length = 8
|
||||
self._page_bit_offset = const.page_bit_offset_x64
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x64
|
||||
else:
|
||||
self.U64 = False
|
||||
self._page_bit_offset = const.page_bit_offset_x86
|
||||
self._subheader_pointer_length = const.subheader_pointer_length_x86
|
||||
self._int_length = 4
|
||||
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
|
||||
if buf == const.align_1_checker_value:
|
||||
align1 = const.align_2_value
|
||||
total_align = align1 + align2
|
||||
|
||||
# Get endianness information
|
||||
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
|
||||
if buf == b"\x01":
|
||||
self.byte_order = "<"
|
||||
else:
|
||||
self.byte_order = ">"
|
||||
|
||||
# Get encoding information
|
||||
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
|
||||
if buf in const.encoding_names:
|
||||
self.file_encoding = const.encoding_names[buf]
|
||||
else:
|
||||
self.file_encoding = f"unknown (code={buf})"
|
||||
|
||||
# Get platform information
|
||||
buf = self._read_bytes(const.platform_offset, const.platform_length)
|
||||
if buf == b"1":
|
||||
self.platform = "unix"
|
||||
elif buf == b"2":
|
||||
self.platform = "windows"
|
||||
else:
|
||||
self.platform = "unknown"
|
||||
|
||||
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
|
||||
self.name = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.name = self.name.decode(self.encoding or self.default_encoding)
|
||||
|
||||
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
|
||||
self.file_type = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.file_type = self.file_type.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
# Timestamp is epoch 01/01/1960
|
||||
epoch = datetime(1960, 1, 1)
|
||||
x = self._read_float(
|
||||
const.date_created_offset + align1, const.date_created_length
|
||||
)
|
||||
self.date_created = epoch + pd.to_timedelta(x, unit="s")
|
||||
x = self._read_float(
|
||||
const.date_modified_offset + align1, const.date_modified_length
|
||||
)
|
||||
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
|
||||
|
||||
self.header_length = self._read_int(
|
||||
const.header_size_offset + align1, const.header_size_length
|
||||
)
|
||||
|
||||
# Read the rest of the header into cached_page.
|
||||
buf = self._path_or_buf.read(self.header_length - 288)
|
||||
self._cached_page += buf
|
||||
# error: Argument 1 to "len" has incompatible type "Optional[bytes]";
|
||||
# expected "Sized"
|
||||
if len(self._cached_page) != self.header_length: # type: ignore[arg-type]
|
||||
raise ValueError("The SAS7BDAT file appears to be truncated.")
|
||||
|
||||
self._page_length = self._read_int(
|
||||
const.page_size_offset + align1, const.page_size_length
|
||||
)
|
||||
self._page_count = self._read_int(
|
||||
const.page_count_offset + align1, const.page_count_length
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.sas_release_offset + total_align, const.sas_release_length
|
||||
)
|
||||
self.sas_release = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.sas_release = self.sas_release.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.sas_server_type_offset + total_align, const.sas_server_type_length
|
||||
)
|
||||
self.server_type = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.server_type = self.server_type.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(
|
||||
const.os_version_number_offset + total_align, const.os_version_number_length
|
||||
)
|
||||
self.os_version = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.os_version = self.os_version.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
|
||||
buf = buf.rstrip(b"\x00 ")
|
||||
if len(buf) > 0:
|
||||
self.os_name = buf.decode(self.encoding or self.default_encoding)
|
||||
else:
|
||||
buf = self._read_bytes(
|
||||
const.os_maker_offset + total_align, const.os_maker_length
|
||||
)
|
||||
self.os_name = buf.rstrip(b"\x00 ")
|
||||
if self.convert_header_text:
|
||||
self.os_name = self.os_name.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
def __next__(self):
|
||||
da = self.read(nrows=self.chunksize or 1)
|
||||
if da is None:
|
||||
self.close()
|
||||
raise StopIteration
|
||||
return da
|
||||
|
||||
# Read a single float of the given width (4 or 8).
|
||||
def _read_float(self, offset: int, width: int):
|
||||
if width not in (4, 8):
|
||||
self.close()
|
||||
raise ValueError("invalid float width")
|
||||
buf = self._read_bytes(offset, width)
|
||||
fd = "f" if width == 4 else "d"
|
||||
return struct.unpack(self.byte_order + fd, buf)[0]
|
||||
|
||||
# Read a single signed integer of the given width (1, 2, 4 or 8).
|
||||
def _read_int(self, offset: int, width: int) -> int:
|
||||
if width not in (1, 2, 4, 8):
|
||||
self.close()
|
||||
raise ValueError("invalid int width")
|
||||
buf = self._read_bytes(offset, width)
|
||||
it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
|
||||
iv = struct.unpack(self.byte_order + it, buf)[0]
|
||||
return iv
|
||||
|
||||
def _read_bytes(self, offset: int, length: int):
|
||||
if self._cached_page is None:
|
||||
self._path_or_buf.seek(offset)
|
||||
buf = self._path_or_buf.read(length)
|
||||
if len(buf) < length:
|
||||
self.close()
|
||||
msg = f"Unable to read {length:d} bytes from file position {offset:d}."
|
||||
raise ValueError(msg)
|
||||
return buf
|
||||
else:
|
||||
if offset + length > len(self._cached_page):
|
||||
self.close()
|
||||
raise ValueError("The cached page is too small.")
|
||||
return self._cached_page[offset : offset + length]
|
||||
|
||||
def _parse_metadata(self) -> None:
|
||||
done = False
|
||||
while not done:
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
break
|
||||
if len(self._cached_page) != self._page_length:
|
||||
raise ValueError("Failed to read a meta data page from the SAS file.")
|
||||
done = self._process_page_meta()
|
||||
|
||||
def _process_page_meta(self) -> bool:
|
||||
self._read_page_header()
|
||||
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
|
||||
if self._current_page_type in pt:
|
||||
self._process_page_metadata()
|
||||
is_data_page = self._current_page_type & const.page_data_type
|
||||
is_mix_page = self._current_page_type in const.page_mix_types
|
||||
return bool(
|
||||
is_data_page
|
||||
or is_mix_page
|
||||
or self._current_page_data_subheader_pointers != []
|
||||
)
|
||||
|
||||
def _read_page_header(self):
|
||||
bit_offset = self._page_bit_offset
|
||||
tx = const.page_type_offset + bit_offset
|
||||
self._current_page_type = self._read_int(tx, const.page_type_length)
|
||||
tx = const.block_count_offset + bit_offset
|
||||
self._current_page_block_count = self._read_int(tx, const.block_count_length)
|
||||
tx = const.subheader_count_offset + bit_offset
|
||||
self._current_page_subheaders_count = self._read_int(
|
||||
tx, const.subheader_count_length
|
||||
)
|
||||
|
||||
def _process_page_metadata(self) -> None:
|
||||
bit_offset = self._page_bit_offset
|
||||
|
||||
for i in range(self._current_page_subheaders_count):
|
||||
pointer = self._process_subheader_pointers(
|
||||
const.subheader_pointers_offset + bit_offset, i
|
||||
)
|
||||
if pointer.length == 0:
|
||||
continue
|
||||
if pointer.compression == const.truncated_subheader_id:
|
||||
continue
|
||||
subheader_signature = self._read_subheader_signature(pointer.offset)
|
||||
subheader_index = self._get_subheader_index(
|
||||
subheader_signature, pointer.compression, pointer.ptype
|
||||
)
|
||||
self._process_subheader(subheader_index, pointer)
|
||||
|
||||
def _get_subheader_index(self, signature: bytes, compression, ptype) -> int:
|
||||
# TODO: return here could be made an enum
|
||||
index = const.subheader_signature_to_index.get(signature)
|
||||
if index is None:
|
||||
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
|
||||
f2 = ptype == const.compressed_subheader_type
|
||||
if (self.compression != b"") and f1 and f2:
|
||||
index = const.SASIndex.data_subheader_index
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError("Unknown subheader signature")
|
||||
return index
|
||||
|
||||
def _process_subheader_pointers(
|
||||
self, offset: int, subheader_pointer_index: int
|
||||
) -> _SubheaderPointer:
|
||||
|
||||
subheader_pointer_length = self._subheader_pointer_length
|
||||
total_offset = offset + subheader_pointer_length * subheader_pointer_index
|
||||
|
||||
subheader_offset = self._read_int(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_length = self._read_int(total_offset, self._int_length)
|
||||
total_offset += self._int_length
|
||||
|
||||
subheader_compression = self._read_int(total_offset, 1)
|
||||
total_offset += 1
|
||||
|
||||
subheader_type = self._read_int(total_offset, 1)
|
||||
|
||||
x = _SubheaderPointer(
|
||||
subheader_offset, subheader_length, subheader_compression, subheader_type
|
||||
)
|
||||
|
||||
return x
|
||||
|
||||
def _read_subheader_signature(self, offset: int) -> bytes:
|
||||
subheader_signature = self._read_bytes(offset, self._int_length)
|
||||
return subheader_signature
|
||||
|
||||
def _process_subheader(
|
||||
self, subheader_index: int, pointer: _SubheaderPointer
|
||||
) -> None:
|
||||
offset = pointer.offset
|
||||
length = pointer.length
|
||||
|
||||
if subheader_index == const.SASIndex.row_size_index:
|
||||
processor = self._process_rowsize_subheader
|
||||
elif subheader_index == const.SASIndex.column_size_index:
|
||||
processor = self._process_columnsize_subheader
|
||||
elif subheader_index == const.SASIndex.column_text_index:
|
||||
processor = self._process_columntext_subheader
|
||||
elif subheader_index == const.SASIndex.column_name_index:
|
||||
processor = self._process_columnname_subheader
|
||||
elif subheader_index == const.SASIndex.column_attributes_index:
|
||||
processor = self._process_columnattributes_subheader
|
||||
elif subheader_index == const.SASIndex.format_and_label_index:
|
||||
processor = self._process_format_subheader
|
||||
elif subheader_index == const.SASIndex.column_list_index:
|
||||
processor = self._process_columnlist_subheader
|
||||
elif subheader_index == const.SASIndex.subheader_counts_index:
|
||||
processor = self._process_subheader_counts
|
||||
elif subheader_index == const.SASIndex.data_subheader_index:
|
||||
self._current_page_data_subheader_pointers.append(pointer)
|
||||
return
|
||||
else:
|
||||
raise ValueError("unknown subheader index")
|
||||
|
||||
processor(offset, length)
|
||||
|
||||
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
|
||||
|
||||
int_len = self._int_length
|
||||
lcs_offset = offset
|
||||
lcp_offset = offset
|
||||
if self.U64:
|
||||
lcs_offset += 682
|
||||
lcp_offset += 706
|
||||
else:
|
||||
lcs_offset += 354
|
||||
lcp_offset += 378
|
||||
|
||||
self.row_length = self._read_int(
|
||||
offset + const.row_length_offset_multiplier * int_len, int_len
|
||||
)
|
||||
self.row_count = self._read_int(
|
||||
offset + const.row_count_offset_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p1 = self._read_int(
|
||||
offset + const.col_count_p1_multiplier * int_len, int_len
|
||||
)
|
||||
self.col_count_p2 = self._read_int(
|
||||
offset + const.col_count_p2_multiplier * int_len, int_len
|
||||
)
|
||||
mx = const.row_count_on_mix_page_offset_multiplier * int_len
|
||||
self._mix_page_row_count = self._read_int(offset + mx, int_len)
|
||||
self._lcs = self._read_int(lcs_offset, 2)
|
||||
self._lcp = self._read_int(lcp_offset, 2)
|
||||
|
||||
def _process_columnsize_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
self.column_count = self._read_int(offset, int_len)
|
||||
if self.col_count_p1 + self.col_count_p2 != self.column_count:
|
||||
print(
|
||||
f"Warning: column count mismatch ({self.col_count_p1} + "
|
||||
f"{self.col_count_p2} != {self.column_count})\n"
|
||||
)
|
||||
|
||||
# Unknown purpose
|
||||
def _process_subheader_counts(self, offset: int, length: int) -> None:
|
||||
pass
|
||||
|
||||
def _process_columntext_subheader(self, offset: int, length: int) -> None:
|
||||
|
||||
offset += self._int_length
|
||||
text_block_size = self._read_int(offset, const.text_block_size_length)
|
||||
|
||||
buf = self._read_bytes(offset, text_block_size)
|
||||
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
|
||||
cname = cname_raw
|
||||
if self.convert_header_text:
|
||||
cname = cname.decode(self.encoding or self.default_encoding)
|
||||
self.column_names_strings.append(cname)
|
||||
|
||||
if len(self.column_names_strings) == 1:
|
||||
compression_literal = b""
|
||||
for cl in const.compression_literals:
|
||||
if cl in cname_raw:
|
||||
compression_literal = cl
|
||||
self.compression = compression_literal
|
||||
offset -= self._int_length
|
||||
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
compression_literal = buf.rstrip(b"\x00")
|
||||
if compression_literal == b"":
|
||||
self._lcs = 0
|
||||
offset1 = offset + 32
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif compression_literal == const.rle_compression:
|
||||
offset1 = offset + 40
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcp)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
elif self._lcs > 0:
|
||||
self._lcp = 0
|
||||
offset1 = offset + 16
|
||||
if self.U64:
|
||||
offset1 += 4
|
||||
buf = self._read_bytes(offset1, self._lcs)
|
||||
self.creator_proc = buf[0 : self._lcp]
|
||||
if self.convert_header_text:
|
||||
if hasattr(self, "creator_proc"):
|
||||
self.creator_proc = self.creator_proc.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
|
||||
def _process_columnname_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
offset += int_len
|
||||
column_name_pointers_count = (length - 2 * int_len - 12) // 8
|
||||
for i in range(column_name_pointers_count):
|
||||
text_subheader = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_text_subheader_offset
|
||||
)
|
||||
col_name_offset = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_offset_offset
|
||||
)
|
||||
col_name_length = (
|
||||
offset
|
||||
+ const.column_name_pointer_length * (i + 1)
|
||||
+ const.column_name_length_offset
|
||||
)
|
||||
|
||||
idx = self._read_int(
|
||||
text_subheader, const.column_name_text_subheader_length
|
||||
)
|
||||
col_offset = self._read_int(
|
||||
col_name_offset, const.column_name_offset_length
|
||||
)
|
||||
col_len = self._read_int(col_name_length, const.column_name_length_length)
|
||||
|
||||
name_str = self.column_names_strings[idx]
|
||||
self.column_names.append(name_str[col_offset : col_offset + col_len])
|
||||
|
||||
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
|
||||
for i in range(column_attributes_vectors_count):
|
||||
col_data_offset = (
|
||||
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
|
||||
)
|
||||
col_data_len = (
|
||||
offset
|
||||
+ 2 * int_len
|
||||
+ const.column_data_length_offset
|
||||
+ i * (int_len + 8)
|
||||
)
|
||||
col_types = (
|
||||
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
|
||||
)
|
||||
|
||||
x = self._read_int(col_data_offset, int_len)
|
||||
self._column_data_offsets.append(x)
|
||||
|
||||
x = self._read_int(col_data_len, const.column_data_length_length)
|
||||
self._column_data_lengths.append(x)
|
||||
|
||||
x = self._read_int(col_types, const.column_type_length)
|
||||
self._column_types.append(b"d" if x == 1 else b"s")
|
||||
|
||||
def _process_columnlist_subheader(self, offset: int, length: int) -> None:
|
||||
# unknown purpose
|
||||
pass
|
||||
|
||||
def _process_format_subheader(self, offset: int, length: int) -> None:
|
||||
int_len = self._int_length
|
||||
text_subheader_format = (
|
||||
offset + const.column_format_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
|
||||
col_format_len = offset + const.column_format_length_offset + 3 * int_len
|
||||
text_subheader_label = (
|
||||
offset + const.column_label_text_subheader_index_offset + 3 * int_len
|
||||
)
|
||||
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
|
||||
col_label_len = offset + const.column_label_length_offset + 3 * int_len
|
||||
|
||||
x = self._read_int(
|
||||
text_subheader_format, const.column_format_text_subheader_index_length
|
||||
)
|
||||
format_idx = min(x, len(self.column_names_strings) - 1)
|
||||
|
||||
format_start = self._read_int(
|
||||
col_format_offset, const.column_format_offset_length
|
||||
)
|
||||
format_len = self._read_int(col_format_len, const.column_format_length_length)
|
||||
|
||||
label_idx = self._read_int(
|
||||
text_subheader_label, const.column_label_text_subheader_index_length
|
||||
)
|
||||
label_idx = min(label_idx, len(self.column_names_strings) - 1)
|
||||
|
||||
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
|
||||
label_len = self._read_int(col_label_len, const.column_label_length_length)
|
||||
|
||||
label_names = self.column_names_strings[label_idx]
|
||||
column_label = label_names[label_start : label_start + label_len]
|
||||
format_names = self.column_names_strings[format_idx]
|
||||
column_format = format_names[format_start : format_start + format_len]
|
||||
current_column_number = len(self.columns)
|
||||
|
||||
col = _Column(
|
||||
current_column_number,
|
||||
self.column_names[current_column_number],
|
||||
column_label,
|
||||
column_format,
|
||||
self._column_types[current_column_number],
|
||||
self._column_data_lengths[current_column_number],
|
||||
)
|
||||
|
||||
self.column_formats.append(column_format)
|
||||
self.columns.append(col)
|
||||
|
||||
def read(self, nrows: int | None = None) -> DataFrame | None:
|
||||
|
||||
if (nrows is None) and (self.chunksize is not None):
|
||||
nrows = self.chunksize
|
||||
elif nrows is None:
|
||||
nrows = self.row_count
|
||||
|
||||
if len(self._column_types) == 0:
|
||||
self.close()
|
||||
raise EmptyDataError("No columns to parse from file")
|
||||
|
||||
if self._current_row_in_file_index >= self.row_count:
|
||||
return None
|
||||
|
||||
m = self.row_count - self._current_row_in_file_index
|
||||
if nrows > m:
|
||||
nrows = m
|
||||
|
||||
nd = self._column_types.count(b"d")
|
||||
ns = self._column_types.count(b"s")
|
||||
|
||||
self._string_chunk = np.empty((ns, nrows), dtype=object)
|
||||
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
|
||||
|
||||
self._current_row_in_chunk_index = 0
|
||||
p = Parser(self)
|
||||
p.read(nrows)
|
||||
|
||||
rslt = self._chunk_to_dataframe()
|
||||
if self.index is not None:
|
||||
rslt = rslt.set_index(self.index)
|
||||
|
||||
return rslt
|
||||
|
||||
def _read_next_page(self):
|
||||
self._current_page_data_subheader_pointers = []
|
||||
self._cached_page = self._path_or_buf.read(self._page_length)
|
||||
if len(self._cached_page) <= 0:
|
||||
return True
|
||||
elif len(self._cached_page) != self._page_length:
|
||||
self.close()
|
||||
msg = (
|
||||
"failed to read complete page from file (read "
|
||||
f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
self._read_page_header()
|
||||
page_type = self._current_page_type
|
||||
if page_type == const.page_meta_type:
|
||||
self._process_page_metadata()
|
||||
|
||||
is_data_page = page_type & const.page_data_type
|
||||
pt = [const.page_meta_type] + const.page_mix_types
|
||||
if not is_data_page and self._current_page_type not in pt:
|
||||
return self._read_next_page()
|
||||
|
||||
return False
|
||||
|
||||
def _chunk_to_dataframe(self) -> DataFrame:
|
||||
|
||||
n = self._current_row_in_chunk_index
|
||||
m = self._current_row_in_file_index
|
||||
ix = range(m - n, m)
|
||||
rslt = {}
|
||||
|
||||
js, jb = 0, 0
|
||||
for j in range(self.column_count):
|
||||
|
||||
name = self.column_names[j]
|
||||
|
||||
if self._column_types[j] == b"d":
|
||||
col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
|
||||
rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix)
|
||||
if self.convert_dates:
|
||||
if self.column_formats[j] in const.sas_date_formats:
|
||||
rslt[name] = _convert_datetimes(rslt[name], "d")
|
||||
elif self.column_formats[j] in const.sas_datetime_formats:
|
||||
rslt[name] = _convert_datetimes(rslt[name], "s")
|
||||
jb += 1
|
||||
elif self._column_types[j] == b"s":
|
||||
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
|
||||
if self.convert_text and (self.encoding is not None):
|
||||
rslt[name] = rslt[name].str.decode(
|
||||
self.encoding or self.default_encoding
|
||||
)
|
||||
if self.blank_missing:
|
||||
ii = rslt[name].str.len() == 0
|
||||
rslt[name][ii] = np.nan
|
||||
js += 1
|
||||
else:
|
||||
self.close()
|
||||
raise ValueError(f"unknown column type {repr(self._column_types[j])}")
|
||||
|
||||
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
|
||||
return df
|
253
.venv/Lib/site-packages/pandas/io/sas/sas_constants.py
Normal file
253
.venv/Lib/site-packages/pandas/io/sas/sas_constants.py
Normal file
@ -0,0 +1,253 @@
|
||||
magic = (
|
||||
b"\x00\x00\x00\x00\x00\x00\x00\x00"
|
||||
+ b"\x00\x00\x00\x00\xc2\xea\x81\x60"
|
||||
+ b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
|
||||
+ b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
|
||||
)
|
||||
|
||||
align_1_checker_value = b"3"
|
||||
align_1_offset = 32
|
||||
align_1_length = 1
|
||||
align_1_value = 4
|
||||
u64_byte_checker_value = b"3"
|
||||
align_2_offset = 35
|
||||
align_2_length = 1
|
||||
align_2_value = 4
|
||||
endianness_offset = 37
|
||||
endianness_length = 1
|
||||
platform_offset = 39
|
||||
platform_length = 1
|
||||
encoding_offset = 70
|
||||
encoding_length = 1
|
||||
dataset_offset = 92
|
||||
dataset_length = 64
|
||||
file_type_offset = 156
|
||||
file_type_length = 8
|
||||
date_created_offset = 164
|
||||
date_created_length = 8
|
||||
date_modified_offset = 172
|
||||
date_modified_length = 8
|
||||
header_size_offset = 196
|
||||
header_size_length = 4
|
||||
page_size_offset = 200
|
||||
page_size_length = 4
|
||||
page_count_offset = 204
|
||||
page_count_length = 4
|
||||
sas_release_offset = 216
|
||||
sas_release_length = 8
|
||||
sas_server_type_offset = 224
|
||||
sas_server_type_length = 16
|
||||
os_version_number_offset = 240
|
||||
os_version_number_length = 16
|
||||
os_maker_offset = 256
|
||||
os_maker_length = 16
|
||||
os_name_offset = 272
|
||||
os_name_length = 16
|
||||
page_bit_offset_x86 = 16
|
||||
page_bit_offset_x64 = 32
|
||||
subheader_pointer_length_x86 = 12
|
||||
subheader_pointer_length_x64 = 24
|
||||
page_type_offset = 0
|
||||
page_type_length = 2
|
||||
block_count_offset = 2
|
||||
block_count_length = 2
|
||||
subheader_count_offset = 4
|
||||
subheader_count_length = 2
|
||||
page_meta_type = 0
|
||||
page_data_type = 256
|
||||
page_amd_type = 1024
|
||||
page_metc_type = 16384
|
||||
page_comp_type = -28672
|
||||
page_mix_types = [512, 640]
|
||||
subheader_pointers_offset = 8
|
||||
truncated_subheader_id = 1
|
||||
compressed_subheader_id = 4
|
||||
compressed_subheader_type = 1
|
||||
text_block_size_length = 2
|
||||
row_length_offset_multiplier = 5
|
||||
row_count_offset_multiplier = 6
|
||||
col_count_p1_multiplier = 9
|
||||
col_count_p2_multiplier = 10
|
||||
row_count_on_mix_page_offset_multiplier = 15
|
||||
column_name_pointer_length = 8
|
||||
column_name_text_subheader_offset = 0
|
||||
column_name_text_subheader_length = 2
|
||||
column_name_offset_offset = 2
|
||||
column_name_offset_length = 2
|
||||
column_name_length_offset = 4
|
||||
column_name_length_length = 2
|
||||
column_data_offset_offset = 8
|
||||
column_data_length_offset = 8
|
||||
column_data_length_length = 4
|
||||
column_type_offset = 14
|
||||
column_type_length = 1
|
||||
column_format_text_subheader_index_offset = 22
|
||||
column_format_text_subheader_index_length = 2
|
||||
column_format_offset_offset = 24
|
||||
column_format_offset_length = 2
|
||||
column_format_length_offset = 26
|
||||
column_format_length_length = 2
|
||||
column_label_text_subheader_index_offset = 28
|
||||
column_label_text_subheader_index_length = 2
|
||||
column_label_offset_offset = 30
|
||||
column_label_offset_length = 2
|
||||
column_label_length_offset = 32
|
||||
column_label_length_length = 2
|
||||
rle_compression = b"SASYZCRL"
|
||||
rdc_compression = b"SASYZCR2"
|
||||
|
||||
compression_literals = [rle_compression, rdc_compression]
|
||||
|
||||
# Incomplete list of encodings, using SAS nomenclature:
|
||||
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
|
||||
encoding_names = {
|
||||
29: "latin1",
|
||||
20: "utf-8",
|
||||
33: "cyrillic",
|
||||
60: "wlatin2",
|
||||
61: "wcyrillic",
|
||||
62: "wlatin1",
|
||||
90: "ebcdic870",
|
||||
}
|
||||
|
||||
|
||||
class SASIndex:
|
||||
row_size_index = 0
|
||||
column_size_index = 1
|
||||
subheader_counts_index = 2
|
||||
column_text_index = 3
|
||||
column_name_index = 4
|
||||
column_attributes_index = 5
|
||||
format_and_label_index = 6
|
||||
column_list_index = 7
|
||||
data_subheader_index = 8
|
||||
|
||||
|
||||
subheader_signature_to_index = {
|
||||
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
|
||||
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
|
||||
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
|
||||
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
|
||||
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
|
||||
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
|
||||
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
|
||||
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
|
||||
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
|
||||
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
|
||||
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
|
||||
}
|
||||
|
||||
|
||||
# List of frequently used SAS date and datetime formats
|
||||
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
|
||||
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
|
||||
sas_date_formats = (
|
||||
"DATE",
|
||||
"DAY",
|
||||
"DDMMYY",
|
||||
"DOWNAME",
|
||||
"JULDAY",
|
||||
"JULIAN",
|
||||
"MMDDYY",
|
||||
"MMYY",
|
||||
"MMYYC",
|
||||
"MMYYD",
|
||||
"MMYYP",
|
||||
"MMYYS",
|
||||
"MMYYN",
|
||||
"MONNAME",
|
||||
"MONTH",
|
||||
"MONYY",
|
||||
"QTR",
|
||||
"QTRR",
|
||||
"NENGO",
|
||||
"WEEKDATE",
|
||||
"WEEKDATX",
|
||||
"WEEKDAY",
|
||||
"WEEKV",
|
||||
"WORDDATE",
|
||||
"WORDDATX",
|
||||
"YEAR",
|
||||
"YYMM",
|
||||
"YYMMC",
|
||||
"YYMMD",
|
||||
"YYMMP",
|
||||
"YYMMS",
|
||||
"YYMMN",
|
||||
"YYMON",
|
||||
"YYMMDD",
|
||||
"YYQ",
|
||||
"YYQC",
|
||||
"YYQD",
|
||||
"YYQP",
|
||||
"YYQS",
|
||||
"YYQN",
|
||||
"YYQR",
|
||||
"YYQRC",
|
||||
"YYQRD",
|
||||
"YYQRP",
|
||||
"YYQRS",
|
||||
"YYQRN",
|
||||
"YYMMDDP",
|
||||
"YYMMDDC",
|
||||
"E8601DA",
|
||||
"YYMMDDN",
|
||||
"MMDDYYC",
|
||||
"MMDDYYS",
|
||||
"MMDDYYD",
|
||||
"YYMMDDS",
|
||||
"B8601DA",
|
||||
"DDMMYYN",
|
||||
"YYMMDDD",
|
||||
"DDMMYYB",
|
||||
"DDMMYYP",
|
||||
"MMDDYYP",
|
||||
"YYMMDDB",
|
||||
"MMDDYYN",
|
||||
"DDMMYYC",
|
||||
"DDMMYYD",
|
||||
"DDMMYYS",
|
||||
"MINGUO",
|
||||
)
|
||||
|
||||
sas_datetime_formats = (
|
||||
"DATETIME",
|
||||
"DTWKDATX",
|
||||
"B8601DN",
|
||||
"B8601DT",
|
||||
"B8601DX",
|
||||
"B8601DZ",
|
||||
"B8601LX",
|
||||
"E8601DN",
|
||||
"E8601DT",
|
||||
"E8601DX",
|
||||
"E8601DZ",
|
||||
"E8601LX",
|
||||
"DATEAMPM",
|
||||
"DTDATE",
|
||||
"DTMONYY",
|
||||
"DTMONYY",
|
||||
"DTWKDATX",
|
||||
"DTYEAR",
|
||||
"TOD",
|
||||
"MDYAMPM",
|
||||
)
|
496
.venv/Lib/site-packages/pandas/io/sas/sas_xport.py
Normal file
496
.venv/Lib/site-packages/pandas/io/sas/sas_xport.py
Normal file
@ -0,0 +1,496 @@
|
||||
"""
|
||||
Read a SAS XPort format file into a Pandas DataFrame.
|
||||
|
||||
Based on code from Jack Cushman (github.com/jcushman/xport).
|
||||
|
||||
The file format is defined here:
|
||||
|
||||
https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import abc
|
||||
from datetime import datetime
|
||||
import struct
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._typing import (
|
||||
FilePath,
|
||||
ReadBuffer,
|
||||
)
|
||||
from pandas.util._decorators import Appender
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from pandas.io.common import get_handle
|
||||
from pandas.io.sas.sasreader import ReaderBase
|
||||
|
||||
_correct_line1 = (
|
||||
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_header1 = (
|
||||
"HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000"
|
||||
)
|
||||
_correct_header2 = (
|
||||
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_correct_obs_header = (
|
||||
"HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
|
||||
"000000000000000000000000000000 "
|
||||
)
|
||||
_fieldkeys = [
|
||||
"ntype",
|
||||
"nhfun",
|
||||
"field_length",
|
||||
"nvar0",
|
||||
"name",
|
||||
"label",
|
||||
"nform",
|
||||
"nfl",
|
||||
"num_decimals",
|
||||
"nfj",
|
||||
"nfill",
|
||||
"niform",
|
||||
"nifl",
|
||||
"nifd",
|
||||
"npos",
|
||||
"_",
|
||||
]
|
||||
|
||||
|
||||
_base_params_doc = """\
|
||||
Parameters
|
||||
----------
|
||||
filepath_or_buffer : str or file-like object
|
||||
Path to SAS file or object implementing binary read method."""
|
||||
|
||||
_params2_doc = """\
|
||||
index : identifier of index column
|
||||
Identifier of column that should be used as index of the DataFrame.
|
||||
encoding : str
|
||||
Encoding for text data.
|
||||
chunksize : int
|
||||
Read file `chunksize` lines at a time, returns iterator."""
|
||||
|
||||
_format_params_doc = """\
|
||||
format : str
|
||||
File format, only `xport` is currently supported."""
|
||||
|
||||
_iterator_doc = """\
|
||||
iterator : bool, default False
|
||||
Return XportReader object for reading file incrementally."""
|
||||
|
||||
|
||||
_read_sas_doc = f"""Read a SAS file into a DataFrame.
|
||||
|
||||
{_base_params_doc}
|
||||
{_format_params_doc}
|
||||
{_params2_doc}
|
||||
{_iterator_doc}
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame or XportReader
|
||||
|
||||
Examples
|
||||
--------
|
||||
Read a SAS Xport file:
|
||||
|
||||
>>> df = pd.read_sas('filename.XPT')
|
||||
|
||||
Read a Xport file in 10,000 line chunks:
|
||||
|
||||
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
|
||||
>>> for chunk in itr:
|
||||
>>> do_something(chunk)
|
||||
|
||||
"""
|
||||
|
||||
_xport_reader_doc = f"""\
|
||||
Class for reading SAS Xport files.
|
||||
|
||||
{_base_params_doc}
|
||||
{_params2_doc}
|
||||
|
||||
Attributes
|
||||
----------
|
||||
member_info : list
|
||||
Contains information about the file
|
||||
fields : list
|
||||
Contains information about the variables in the file
|
||||
"""
|
||||
|
||||
_read_method_doc = """\
|
||||
Read observations from SAS Xport file, returning as data frame.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nrows : int
|
||||
Number of rows to read from data file; if None, read whole
|
||||
file.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A DataFrame.
|
||||
"""
|
||||
|
||||
|
||||
def _parse_date(datestr: str) -> datetime:
|
||||
"""Given a date in xport format, return Python date."""
|
||||
try:
|
||||
# e.g. "16FEB11:10:07:55"
|
||||
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
|
||||
except ValueError:
|
||||
return pd.NaT
def _split_line(s: str, parts):
    """
    Parameters
    ----------
    s: str
        Fixed-length string to split
    parts: list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    """
    out = {}
    start = 0
    for name, length in parts:
        out[name] = s[start : start + length].strip()
        start += length
    del out["_"]
    return out
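# Editor's note -- illustrative sketch, not part of the pandas source:
# _split_line carves a fixed-width header record into named, stripped fields
# and drops the filler field named "_", e.g.
#
#     >>> _split_line("SAS     demo    ", [("prefix", 8), ("name", 8), ("_", 1)])
#     {'prefix': 'SAS', 'name': 'demo'}
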
def _handle_truncated_float_vec(vec, nbytes):
    # This feature is not well documented, but some SAS XPORT files
    # have 2-7 byte "truncated" floats. To read these truncated
    # floats, pad them with zeros on the right to make 8 byte floats.
    #
    # References:
    # https://github.com/jcushman/xport/pull/3
    # The R "foreign" library

    if nbytes != 8:
        vec1 = np.zeros(len(vec), np.dtype("S8"))
        dtype = np.dtype(f"S{nbytes},S{8 - nbytes}")
        vec2 = vec1.view(dtype=dtype)
        vec2["f0"] = vec
        return vec2

    return vec
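# Editor's note -- illustrative sketch, not part of the pandas source:
# a 2-byte truncated float such as b"\x41\x10" is right-padded with zero
# bytes to the full 8-byte IBM layout before _parse_float_vec runs:
#
#     >>> short = np.array([b"\x41\x10"], dtype="S2")
#     >>> _handle_truncated_float_vec(short, 2).tobytes()
#     b'A\x10\x00\x00\x00\x00\x00\x00'
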
def _parse_float_vec(vec):
    """
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    """
    dtype = np.dtype(">u4,>u4")
    vec1 = vec.view(dtype=dtype)
    xport1 = vec1["f0"]
    xport2 = vec1["f1"]

    # Start by setting first half of ieee number to first half of IBM
    # number sans exponent
    ieee1 = xport1 & 0x00FFFFFF

    # The fraction bit to the left of the binary point in the ieee
    # format was set and the number was shifted 0, 1, 2, or 3
    # places. This will tell us how to adjust the ibm exponent to be a
    # power of 2 ieee exponent and how to shift the fraction bits to
    # restore the correct magnitude.
    shift = np.zeros(len(vec), dtype=np.uint8)
    shift[np.where(xport1 & 0x00200000)] = 1
    shift[np.where(xport1 & 0x00400000)] = 2
    shift[np.where(xport1 & 0x00800000)] = 3

    # shift the ieee number down the correct number of places then
    # set the second half of the ieee number to be the second half
    # of the ibm number shifted appropriately, ored with the bits
    # from the first half that would have been shifted in if we
    # could shift a double. All we are worried about are the low
    # order 3 bits of the first half since we're only shifting by
    # 1, 2, or 3.
    ieee1 >>= shift
    ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))

    # clear the 1 bit to the left of the binary point
    ieee1 &= 0xFFEFFFFF

    # set the exponent of the ieee number to be the actual exponent
    # plus the shift count + 1023. Or this into the first half of the
    # ieee number. The ibm exponent is excess 64 but is adjusted by 65
    # since during conversion to ibm format the exponent is
    # incremented by 1 and the fraction bits left 4 positions to the
    # right of the radix point. (had to add >> 24 because C treats &
    # 0x7f as 0x7f000000 and Python doesn't)
    ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
        xport1 & 0x80000000
    )

    ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
    ieee["f0"] = ieee1
    ieee["f1"] = ieee2
    ieee = ieee.view(dtype=">f8")
    ieee = ieee.astype("f8")

    return ieee
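# Editor's note -- illustrative worked example, not part of the pandas source:
# the IBM hex-float bytes 41 10 00 00 00 00 00 00 encode 0.0625 * 16**1 = 1.0
# (excess-64 exponent 0x41, fraction 0x10...), and the routine above converts
# them to the native IEEE double 1.0:
#
#     >>> ibm = np.frombuffer(b"\x41\x10\x00\x00\x00\x00\x00\x00", dtype="S8")
#     >>> _parse_float_vec(ibm)
#     array([1.])
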
class XportReader(ReaderBase, abc.Iterator):
    __doc__ = _xport_reader_doc

    def __init__(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[bytes],
        index=None,
        encoding: str | None = "ISO-8859-1",
        chunksize=None,
    ):

        self._encoding = encoding
        self._lines_read = 0
        self._index = index
        self._chunksize = chunksize

        self.handles = get_handle(
            filepath_or_buffer, "rb", encoding=encoding, is_text=False
        )
        self.filepath_or_buffer = self.handles.handle

        try:
            self._read_header()
        except Exception:
            self.close()
            raise

    def close(self):
        self.handles.close()

    def _get_row(self):
        return self.filepath_or_buffer.read(80).decode()

    def _read_header(self):
        self.filepath_or_buffer.seek(0)

        # read file header
        line1 = self._get_row()
        if line1 != _correct_line1:
            if "**COMPRESSED**" in line1:
                # this was created with the PROC CPORT method and can't be read
                # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm
                raise ValueError(
                    "Header record indicates a CPORT file, which is not readable."
                )
            raise ValueError("Header record is not an XPORT file.")

        line2 = self._get_row()
        fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
        file_info = _split_line(line2, fif)
        if file_info["prefix"] != "SAS     SAS     SASLIB":
            raise ValueError("Header record has invalid prefix.")
        file_info["created"] = _parse_date(file_info["created"])
        self.file_info = file_info

        line3 = self._get_row()
        file_info["modified"] = _parse_date(line3[:16])

        # read member header
        header1 = self._get_row()
        header2 = self._get_row()
        headflag1 = header1.startswith(_correct_header1)
        headflag2 = header2 == _correct_header2
        if not (headflag1 and headflag2):
            raise ValueError("Member header not found")
        # usually 140, could be 135
        fieldnamelength = int(header1[-5:-2])

        # member info
        mem = [
            ["prefix", 8],
            ["set_name", 8],
            ["sasdata", 8],
            ["version", 8],
            ["OS", 8],
            ["_", 24],
            ["created", 16],
        ]
        member_info = _split_line(self._get_row(), mem)
        mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
        member_info.update(_split_line(self._get_row(), mem))
        member_info["modified"] = _parse_date(member_info["modified"])
        member_info["created"] = _parse_date(member_info["created"])
        self.member_info = member_info

        # read field names
        types = {1: "numeric", 2: "char"}
        fieldcount = int(self._get_row()[54:58])
        datalength = fieldnamelength * fieldcount
        # round up to nearest 80
        if datalength % 80:
            datalength += 80 - datalength % 80
        fielddata = self.filepath_or_buffer.read(datalength)
        fields = []
        obs_length = 0
        while len(fielddata) >= fieldnamelength:
            # pull data for one field
            fieldbytes, fielddata = (
                fielddata[:fieldnamelength],
                fielddata[fieldnamelength:],
            )

            # rest at end gets ignored, so if field is short, pad out
            # to match struct pattern below
            fieldbytes = fieldbytes.ljust(140)

            fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
            field = dict(zip(_fieldkeys, fieldstruct))
            del field["_"]
            field["ntype"] = types[field["ntype"]]
            fl = field["field_length"]
            if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
                msg = f"Floating field width {fl} is not between 2 and 8."
                raise TypeError(msg)

            for k, v in field.items():
                try:
                    field[k] = v.strip()
                except AttributeError:
                    pass

            obs_length += field["field_length"]
            fields += [field]

        header = self._get_row()
        if not header == _correct_obs_header:
            raise ValueError("Observation header not found.")

        self.fields = fields
        self.record_length = obs_length
        self.record_start = self.filepath_or_buffer.tell()

        self.nobs = self._record_count()
        self.columns = [x["name"].decode() for x in self.fields]

        # Setup the dtype.
        dtypel = [
            ("s" + str(i), "S" + str(field["field_length"]))
            for i, field in enumerate(self.fields)
        ]
        dtype = np.dtype(dtypel)
        self._dtype = dtype

    def __next__(self):
        return self.read(nrows=self._chunksize or 1)

    def _record_count(self) -> int:
        """
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        """
        self.filepath_or_buffer.seek(0, 2)
        total_records_length = self.filepath_or_buffer.tell() - self.record_start

        if total_records_length % 80 != 0:
            warnings.warn("xport file may be corrupted.")

        if self.record_length > 80:
            self.filepath_or_buffer.seek(self.record_start)
            return total_records_length // self.record_length

        self.filepath_or_buffer.seek(-80, 2)
        last_card_bytes = self.filepath_or_buffer.read(80)
        last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)

        # 8 byte blank
        ix = np.flatnonzero(last_card == 2314885530818453536)

        if len(ix) == 0:
            tail_pad = 0
        else:
            tail_pad = 8 * len(ix)

        self.filepath_or_buffer.seek(self.record_start)

        return (total_records_length - tail_pad) // self.record_length
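    # Editor's note -- illustrative sketch, not part of the pandas source:
    # the magic constant above is just eight ASCII blanks reinterpreted as a
    # single unsigned 64-bit integer, which is how trailing pad bytes in the
    # final 80-byte card appear when record_length <= 80:
    #
    #     >>> int.from_bytes(b"        ", "little")
    #     2314885530818453536
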
    def get_chunk(self, size=None):
        """
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read. If None, reads whole file.

        Returns
        -------
        DataFrame
        """
        if size is None:
            size = self._chunksize
        return self.read(nrows=size)

    def _missing_double(self, vec):
        v = vec.view(dtype="u1,u1,u2,u4")
        miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
        miss1 = (
            ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
            | (v["f0"] == 0x5F)
            | (v["f0"] == 0x2E)
        )
        miss &= miss1
        return miss
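    # Editor's note -- illustrative sketch, not part of the pandas source:
    # SAS encodes the ordinary missing value "." and the special missing
    # values "._" and ".A" through ".Z" as a single tag byte (0x2E, 0x5F or
    # 0x41-0x5A) followed by seven zero bytes, which is exactly the pattern
    # _missing_double() tests for.  Given a reader instance ``rdr``:
    #
    #     >>> dot = np.frombuffer(b".\x00\x00\x00\x00\x00\x00\x00", dtype="S8")
    #     >>> bool(rdr._missing_double(dot)[0])
    #     True
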
    @Appender(_read_method_doc)
    def read(self, nrows=None):

        if nrows is None:
            nrows = self.nobs

        read_lines = min(nrows, self.nobs - self._lines_read)
        read_len = read_lines * self.record_length
        if read_len <= 0:
            self.close()
            raise StopIteration
        raw = self.filepath_or_buffer.read(read_len)
        data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)

        df = pd.DataFrame(index=range(read_lines))
        for j, x in enumerate(self.columns):
            vec = data["s" + str(j)]
            ntype = self.fields[j]["ntype"]
            if ntype == "numeric":
                vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
                miss = self._missing_double(vec)
                v = _parse_float_vec(vec)
                v[miss] = np.nan
            elif self.fields[j]["ntype"] == "char":
                v = [y.rstrip() for y in vec]

                if self._encoding is not None:
                    v = [y.decode(self._encoding) for y in v]

            df[x] = v

        if self._index is None:
            df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
        else:
            df = df.set_index(self._index)

        self._lines_read += read_lines

        return df
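# Editor's note -- illustrative usage sketch, not part of the pandas source
# (file name and the ``process`` callable below are hypothetical):
#
#     >>> with XportReader("data.xpt", encoding="ISO-8859-1", chunksize=10000) as rdr:
#     ...     for chunk in rdr:
#     ...         process(chunk)
#
# Iteration ends with StopIteration once read() has consumed all ``nobs``
# rows, and the context manager inherited from ReaderBase closes the handle.
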
158
.venv/Lib/site-packages/pandas/io/sas/sasreader.py
Normal file
@ -0,0 +1,158 @@
"""
Read SAS sas7bdat or xport files.
"""
from __future__ import annotations

from abc import (
    ABCMeta,
    abstractmethod,
)
from typing import (
    TYPE_CHECKING,
    Hashable,
    overload,
)

from pandas._typing import (
    FilePath,
    ReadBuffer,
)

from pandas.io.common import stringify_path

if TYPE_CHECKING:
    from pandas import DataFrame


# TODO(PY38): replace with Protocol in Python 3.8
class ReaderBase(metaclass=ABCMeta):
    """
    Protocol for XportReader and SAS7BDATReader classes.
    """

    @abstractmethod
    def read(self, nrows=None):
        pass

    @abstractmethod
    def close(self):
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
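# Editor's note -- illustrative sketch, not part of the pandas source:
# because ReaderBase implements __enter__/__exit__, whatever read_sas()
# (defined below) returns in iterator/chunksize mode can be used as a
# context manager, guaranteeing the underlying handle is closed:
#
#     >>> with read_sas("data.xpt", iterator=True) as rdr:   # hypothetical file
#     ...     first_rows = rdr.read(nrows=5)
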
@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: int = ...,
    iterator: bool = ...,
) -> ReaderBase:
    ...


@overload
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    format: str | None = ...,
    index: Hashable | None = ...,
    encoding: str | None = ...,
    chunksize: None = ...,
    iterator: bool = ...,
) -> DataFrame | ReaderBase:
    ...


def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
) -> DataFrame | ReaderBase:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify a format string"
        )
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    reader: ReaderBase
    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    with reader:
        return reader.read()
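# Editor's note -- illustrative usage sketch, not part of the pandas source
# (file names are hypothetical):
#
#     >>> df = read_sas("survey.sas7bdat")             # format inferred from extension
#     >>> df = read_sas(open("survey.xpt", "rb"), format="xport")  # buffers need format=
#     >>> with read_sas("survey.sas7bdat", chunksize=1000) as rdr: # iterator of chunks
#     ...     for chunk in rdr:
#     ...         pass
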