-
Notifications
You must be signed in to change notification settings - Fork 802
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ParquetError::NeedMoreData
mark ParquetError
as non_exhaustive
#6630
Open
etseidl
wants to merge
5
commits into
apache:master
Choose a base branch
from
etseidl:need_more_data
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 3 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
3a753d0
add ParquetError::NeedMoreData
etseidl 5460f29
make ParquetError non_exhaustive
etseidl 6aafcad
attempt to better describe use of NeedMoreData
etseidl 6969808
add test of multiple retries
etseidl decece0
Merge remote-tracking branch 'origin/master' into need_more_data
etseidl File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -179,7 +179,9 @@ impl ParquetMetaDataReader { | |
/// # Errors | ||
/// | ||
/// This function will return [`ParquetError::NeedMoreData`] in the event `reader` does not | ||
/// provide enough data to fully parse the metadata (see example below). | ||
/// provide enough data to fully parse the metadata (see example below). The returned error | ||
/// will be populated with a `usize` field indicating the number of bytes required from the | ||
/// tail of the file to completely parse the requested metadata. | ||
/// | ||
/// Other errors returned include [`ParquetError::General`] and [`ParquetError::EOF`]. | ||
/// | ||
|
@@ -192,18 +194,54 @@ impl ParquetMetaDataReader { | |
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); } | ||
/// let file = open_parquet_file("some_path.parquet"); | ||
/// let len = file.len() as usize; | ||
/// let bytes = get_bytes(&file, 1000..len); | ||
/// // Speculatively read 1 kilobyte from the end of the file | ||
/// let bytes = get_bytes(&file, len - 1024..len); | ||
/// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); | ||
/// match reader.try_parse_sized(&bytes, len) { | ||
/// Ok(_) => (), | ||
/// Err(ParquetError::NeedMoreData(needed)) => { | ||
/// // Read the needed number of bytes from the end of the file | ||
/// let bytes = get_bytes(&file, len - needed..len); | ||
/// reader.try_parse_sized(&bytes, len).unwrap(); | ||
/// } | ||
/// _ => panic!("unexpected error") | ||
/// } | ||
/// let metadata = reader.finish().unwrap(); | ||
/// ``` | ||
/// | ||
/// Note that the file metadata may be completely read even when there are | ||
/// insufficient bytes available to read the page indexes. [`Self::has_metadata()`] can be used | ||
/// to test for this. In the event the file metadata is present, re-parsing of the file | ||
/// metadata can be skipped by using [`Self::read_page_indexes_sized()`], as shown below. | ||
/// ```no_run | ||
/// # use parquet::file::metadata::ParquetMetaDataReader; | ||
/// # use parquet::errors::ParquetError; | ||
/// # use crate::parquet::file::reader::Length; | ||
/// # fn get_bytes(file: &std::fs::File, range: std::ops::Range<usize>) -> bytes::Bytes { unimplemented!(); } | ||
/// # fn open_parquet_file(path: &str) -> std::fs::File { unimplemented!(); } | ||
/// let file = open_parquet_file("some_path.parquet"); | ||
/// let len = file.len() as usize; | ||
/// // Speculatively read 1 kilobyte from the end of the file | ||
/// let mut bytes = get_bytes(&file, len - 1024..len); | ||
/// let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); | ||
/// // Loop until `bytes` is large enough | ||
/// loop { | ||
/// match reader.try_parse_sized(&bytes, len) { | ||
/// Ok(_) => break, | ||
/// Err(ParquetError::NeedMoreData(needed)) => { | ||
/// // Read the needed number of bytes from the end of the file | ||
/// bytes = get_bytes(&file, len - needed..len); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
/// // If the file metadata was read, only read the page indexes; otherwise continue the loop | ||
/// if reader.has_metadata() { | ||
/// reader.read_page_indexes_sized(&bytes, len); | ||
/// break; | ||
/// } | ||
/// } | ||
/// _ => panic!("unexpected error") | ||
/// } | ||
/// } | ||
/// let metadata = reader.finish().unwrap(); | ||
/// ``` | ||
pub fn try_parse_sized<R: ChunkReader>(&mut self, reader: &R, file_size: usize) -> Result<()> { | ||
self.metadata = match self.parse_metadata(reader) { | ||
Ok(metadata) => Some(metadata), | ||
|
@@ -241,7 +279,8 @@ impl ParquetMetaDataReader { | |
/// Read the page index structures when a [`ParquetMetaData`] has already been obtained. | ||
/// This variant is used when `reader` cannot access the entire Parquet file (e.g. it is | ||
/// a [`Bytes`] struct containing the tail of the file). | ||
/// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. | ||
/// See [`Self::new_with_metadata()`] and [`Self::has_metadata()`]. Like | ||
/// [`Self::try_parse_sized()`], this function may return [`ParquetError::NeedMoreData`]. | ||
pub fn read_page_indexes_sized<R: ChunkReader>( | ||
&mut self, | ||
reader: &R, | ||
|
@@ -790,6 +829,26 @@ mod tests { | |
_ => panic!("unexpected error"), | ||
}; | ||
|
||
// not enough for file metadata, but keep trying until page indexes are read | ||
let mut reader = ParquetMetaDataReader::new().with_page_indexes(true); | ||
let mut bytes = bytes_for_range(452505..len); | ||
loop { | ||
match reader.try_parse_sized(&bytes, len) { | ||
Ok(_) => break, | ||
Err(ParquetError::NeedMoreData(needed)) => { | ||
bytes = bytes_for_range(len - needed..len); | ||
if reader.has_metadata() { | ||
reader.read_page_indexes_sized(&bytes, len).unwrap(); | ||
break; | ||
} | ||
} | ||
_ => panic!("unexpected error"), | ||
} | ||
} | ||
let metadata = reader.finish().unwrap(); | ||
assert!(metadata.column_index.is_some()); | ||
assert!(metadata.offset_index.is_some()); | ||
|
||
// not enough for page index but lie about file size | ||
let bytes = bytes_for_range(323584..len); | ||
let reader_result = reader.try_parse_sized(&bytes, len - 323584).unwrap_err(); | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we also mark
ParquetError
non exhaustive at the same time (so future additions won't be breaking API changes)? Like this:
arrow-rs/object_store/src/lib.rs
Line 1228 in 56525ef