# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

import pyarrow as pa
from pyarrow import fs
from pyarrow.filesystem import FileSystem, LocalFileSystem
from pyarrow.tests.parquet.common import parametrize_legacy_dataset

try:
    import pyarrow.parquet as pq
    from pyarrow.tests.parquet.common import _read_table, _test_dataframe
except ImportError:
    pq = None


try:
    import pandas as pd
    import pandas.testing as tm
except ImportError:
    pd = tm = None

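# Build a file incrementally with repeated write_table calls on a single
# ParquetWriter, then check that the reassembled table round-trips.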
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_incremental_file_build(tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.6')

    frames = []
    for i in range(10):
        df['unique_id'] = i
        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
        writer.write_table(arrow_table)

        frames.append(df.copy())

    writer.close()

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)

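# write_table should raise when the table's schema does not match the
# schema the writer was opened with.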
def test_validate_schema_write_table(tempdir):
    # ARROW-2926
    simple_fields = [
        pa.field('POS', pa.uint32()),
        pa.field('desc', pa.string())
    ]

    simple_schema = pa.schema(simple_fields)

    # simple_table schema does not match simple_schema
    simple_from_array = [pa.array([1]), pa.array(['bla'])]
    simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])

    path = tempdir / 'simple_validate_schema.parquet'

    with pq.ParquetWriter(path, simple_schema,
                          version='2.6',
                          compression='snappy', flavor='spark') as w:
        with pytest.raises(ValueError):
            w.write_table(simple_table)

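# Same incremental-build scenario as above, but driving ParquetWriter as a
# context manager instead of calling close() explicitly.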
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj(tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    with pq.ParquetWriter(out, arrow_table.schema, version='2.6') as writer:

        frames = []
        for i in range(10):
            df['unique_id'] = i
            arrow_table = pa.Table.from_pandas(df, preserve_index=False)
            writer.write_table(arrow_table)

            frames.append(df.copy())

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)

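# An exception raised inside the ParquetWriter context must still close the
# file cleanly, so everything written before the error remains readable.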
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_context_obj_with_exception(
    tempdir, use_legacy_dataset
):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()
    error_text = 'Artificial Error'

    try:
        with pq.ParquetWriter(out,
                              arrow_table.schema,
                              version='2.6') as writer:

            frames = []
            for i in range(10):
                df['unique_id'] = i
                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
                writer.write_table(arrow_table)
                frames.append(df.copy())
                if i == 5:
                    raise ValueError(error_text)
    except Exception as e:
        assert str(e) == error_text

    buf = out.getvalue()
    result = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)

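# write() is a convenience wrapper: it should behave like write_table for a
# Table and like write_batch for a RecordBatch, across all filesystem
# variants.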
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    LocalFileSystem._get_instance(),
    fs.LocalFileSystem(),
])
def test_parquet_writer_write_wrappers(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    path_table = str(tempdir / 'data_table.parquet')
    path_batch = str(tempdir / 'data_batch.parquet')

    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_batch(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_table, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(table)

    result = _read_table(path_table).to_pandas()
    tm.assert_frame_equal(result, df)

    with pq.ParquetWriter(
        path_batch, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write(batch)

    result = _read_table(path_batch).to_pandas()
    tm.assert_frame_equal(result, df)

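# A local path should work with no filesystem, with the legacy
# LocalFileSystem, and with the newer fs.LocalFileSystem.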
@pytest.mark.pandas
@pytest.mark.parametrize("filesystem", [
    None,
    LocalFileSystem._get_instance(),
    fs.LocalFileSystem(),
])
def test_parquet_writer_filesystem_local(tempdir, filesystem):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    path = str(tempdir / 'data.parquet')

    with pq.ParquetWriter(
        path, table.schema, filesystem=filesystem, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path).to_pandas()
    tm.assert_frame_equal(result, df)

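# Write through an explicit S3 filesystem object and read back via the URI.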
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(
        path, table.schema, filesystem=fs, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(uri).to_pandas()
    tm.assert_frame_equal(result, df)

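# Passing an S3 URI directly lets ParquetWriter infer the filesystem itself.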
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, uri, path = s3_example_fs

    with pq.ParquetWriter(uri, table.schema, version='2.6') as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)

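# The writer should also accept an external s3fs-style filesystem object.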
@pytest.mark.pandas
@pytest.mark.s3
def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    fs, directory = s3_example_s3fs
    path = directory + "/test.parquet"

    with pq.ParquetWriter(
        path, table.schema, filesystem=fs, version='2.6'
    ) as writer:
        writer.write_table(table)

    result = _read_table(path, filesystem=fs).to_pandas()
    tm.assert_frame_equal(result, df)

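# Combining an explicit filesystem with an already-open file-like object is
# ambiguous and must raise.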
@pytest.mark.pandas
def test_parquet_writer_filesystem_buffer_raises():
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)
    filesystem = fs.LocalFileSystem()

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError, match="specified path is file-like"):
        pq.ParquetWriter(
            pa.BufferOutputStream(), table.schema, filesystem=filesystem
        )

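# With a user-supplied legacy FileSystem subclass, the writer should call
# its open() with the target path and mode 'wb', and close the returned
# stream when the context exits.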
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
    out = pa.BufferOutputStream()

    class CustomFS(FileSystem):
        def __init__(self):
            self.path = None
            self.mode = None

        def open(self, path, mode='rb'):
            self.path = path
            self.mode = mode
            return out

    fs = CustomFS()
    fname = 'expected_fname.parquet'
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.6') \
            as writer:
        writer.write_table(table)

    assert fs.path == fname
    assert fs.mode == 'wb'
    assert out.closed

    buf = out.getvalue()
    table_read = _read_table(
        pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df_read, df)

    # Should raise ValueError when filesystem is passed with a file-like
    # object. The assertion has to sit outside the raises block; placed
    # inside it (as before), the raised error skips it, leaving dead code.
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema, filesystem=fs)
    assert "file-like" in str(err_info.value)