JSON read

When compiled with the `io_json` feature, you can use this crate to read NDJSON files:

use std::fs::File;
use std::io::{BufReader, Seek};

use arrow2::array::Array;
use arrow2::error::Result;
use arrow2::io::ndjson::read;
use arrow2::io::ndjson::read::FallibleStreamingIterator;

/// Reads the NDJSON file at `path` and returns its content as a list of
/// arrays, one per batch of rows.
///
/// The data type is inferred from the file itself; the file is then rewound
/// and read again from the start, batch by batch.
///
/// # Errors
///
/// Propagates any error from opening or rewinding the file, from inferring
/// the data type, from reading rows, or from deserializing a batch.
fn read_path(path: &str) -> Result<Vec<Box<dyn Array>>> {
    // number of rows per array
    const BATCH_SIZE: usize = 1024;

    let mut file = BufReader::new(File::open(path)?);

    // Inference consumes the reader, so seek back to the beginning before
    // the rows are read.
    let data_type = read::infer(&mut file, None)?;
    file.rewind()?;

    // A buffer of `BATCH_SIZE` empty strings is handed to the row reader.
    let row_buffer = vec![String::new(); BATCH_SIZE];
    let mut row_reader = read::FileReader::new(file, row_buffer, None);

    let mut arrays = Vec::new();
    loop {
        // `next` is IO-bounded
        let rows = match row_reader.next()? {
            Some(rows) => rows,
            None => break,
        };
        // `deserialize` is CPU-bounded
        arrays.push(read::deserialize(rows, data_type.clone())?);
    }

    Ok(arrays)
}

/// Example of reading a NDJSON file from a path.
///
/// Expects the path of the file as the first command-line argument. When the
/// argument is missing, a usage message is printed to stderr and the process
/// exits with status 1 instead of panicking on an out-of-bounds index.
///
/// # Errors
///
/// Propagates any error returned by `read_path`.
fn main() -> Result<()> {
    use std::env;

    // Argument 0 is the program name; argument 1 is the file to read.
    let file_path = match env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <path-to-ndjson-file>");
            std::process::exit(1);
        }
    };

    let arrays = read_path(&file_path)?;
    println!("{arrays:#?}");
    Ok(())
}

Note how deserialization can be performed on a separate thread pool to avoid blocking the runtime (see also here).

This crate also supports reading JSON, at the expense of being unable to read the file in chunks.

/// Example of reading a JSON file.
use std::fs;

use arrow2::array::Array;
use arrow2::error::Result;
use arrow2::io::json::read;

/// Reads the JSON file at `path` into a single array.
///
/// The whole file is loaded into memory, parsed, and its data type is
/// inferred from the parsed content before deserializing.
///
/// # Errors
///
/// Propagates any error from reading the file, parsing the JSON, inferring
/// the data type, or deserializing.
fn read_path(path: &str) -> Result<Box<dyn Array>> {
    // IO-bounded: load the whole file into memory
    let bytes = fs::read(path)?;

    // CPU-bounded: create a non-owning struct over the bytes just read
    let parsed = read::json_deserializer::parse(&bytes)?;

    // CPU-bounded: infer an Arrow schema from the parsed value...
    let inferred = read::infer(&parsed)?;

    // ...and deserialize the value with it
    let array = read::deserialize(&parsed, inferred)?;
    Ok(array)
}

/// Reads the JSON file named by the first command-line argument and prints
/// the resulting array.
///
/// When the argument is missing, a usage message is printed to stderr and the
/// process exits with status 1 instead of panicking on an out-of-bounds index.
///
/// # Errors
///
/// Propagates any error returned by `read_path`.
fn main() -> Result<()> {
    use std::env;

    // Argument 0 is the program name; argument 1 is the file to read.
    let file_path = match env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <path-to-json-file>");
            std::process::exit(1);
        }
    };

    let batch = read_path(&file_path)?;
    println!("{batch:#?}");
    Ok(())
}

Metadata and inference

This crate uses the following mapping between Arrow's data type and JSON:

| JSON   | DataType |
|--------|----------|
| Bool   | Boolean  |
| Int    | Int64    |
| Float  | Float64  |
| String | Utf8     |
| List   | List     |
| Object | Struct   |