JSON read
When compiled with the feature `io_json`, you can use this crate to read NDJSON files:
use std::fs::File; use std::io::{BufReader, Seek}; use arrow2::array::Array; use arrow2::error::Result; use arrow2::io::ndjson::read; use arrow2::io::ndjson::read::FallibleStreamingIterator; fn read_path(path: &str) -> Result<Vec<Box<dyn Array>>> { let batch_size = 1024; // number of rows per array let mut reader = BufReader::new(File::open(path)?); let data_type = read::infer(&mut reader, None)?; reader.rewind()?; let mut reader = read::FileReader::new(reader, vec!["".to_string(); batch_size], None); let mut arrays = vec![]; // `next` is IO-bounded while let Some(rows) = reader.next()? { // `deserialize` is CPU-bounded let array = read::deserialize(rows, data_type.clone())?; arrays.push(array); } Ok(arrays) } fn main() -> Result<()> { // Example of reading a NDJSON file from a path use std::env; let args: Vec<String> = env::args().collect(); let file_path = &args[1]; let arrays = read_path(file_path)?; println!("{:#?}", arrays); Ok(()) }
Note how deserialization can be performed on a separate thread pool to avoid blocking the runtime (see also here).
This crate also supports reading JSON, at the expense of being unable to read the file in chunks.
/// Example of reading a JSON file. use std::fs; use arrow2::array::Array; use arrow2::error::Result; use arrow2::io::json::read; fn read_path(path: &str) -> Result<Box<dyn Array>> { // read the file into memory (IO-bounded) let data = fs::read(path)?; // create a non-owning struct of the data (CPU-bounded) let json = read::json_deserializer::parse(&data)?; // use it to infer an Arrow schema (CPU-bounded) let data_type = read::infer(&json)?; // and deserialize it (CPU-bounded) read::deserialize(&json, data_type) } fn main() -> Result<()> { use std::env; let args: Vec<String> = env::args().collect(); let file_path = &args[1]; let batch = read_path(file_path)?; println!("{:#?}", batch); Ok(()) }
Metadata and inference
This crate uses the following mapping between JSON types and Arrow's data types:
JSON | DataType |
---|---|
Bool | Boolean |
Int | Int64 |
Float | Float64 |
String | Utf8 |
List | List |
Object | Struct |