1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
use crate::datatypes::{DataType, Field, TimeUnit};
use ahash::AHashSet;
pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
fn is_boolean(bytes: &[u8]) -> bool {
bytes.eq_ignore_ascii_case(b"true") | bytes.eq_ignore_ascii_case(b"false")
}
fn is_float(bytes: &[u8]) -> bool {
lexical_core::parse::<f64>(bytes).is_ok()
}
fn is_integer(bytes: &[u8]) -> bool {
lexical_core::parse::<i64>(bytes).is_ok()
}
fn is_date(string: &str) -> bool {
string.parse::<chrono::NaiveDate>().is_ok()
}
fn is_time(string: &str) -> bool {
string.parse::<chrono::NaiveTime>().is_ok()
}
fn is_naive_datetime(string: &str) -> bool {
string.parse::<chrono::NaiveDateTime>().is_ok()
}
fn is_datetime(string: &str) -> Option<String> {
let mut parsed = chrono::format::Parsed::new();
let fmt = chrono::format::StrftimeItems::new(RFC3339);
if chrono::format::parse(&mut parsed, string, fmt).is_ok() {
parsed.offset.map(|x| {
let hours = x / 60 / 60;
let minutes = x / 60 - hours * 60;
format!("{:03}:{:02}", hours, minutes)
})
} else {
None
}
}
pub fn infer(bytes: &[u8]) -> DataType {
if is_boolean(bytes) {
DataType::Boolean
} else if is_integer(bytes) {
DataType::Int64
} else if is_float(bytes) {
DataType::Float64
} else if let Ok(string) = simdutf8::basic::from_utf8(bytes) {
if is_date(string) {
DataType::Date32
} else if is_time(string) {
DataType::Time32(TimeUnit::Millisecond)
} else if is_naive_datetime(string) {
DataType::Timestamp(TimeUnit::Millisecond, None)
} else if let Some(offset) = is_datetime(string) {
DataType::Timestamp(TimeUnit::Millisecond, Some(offset))
} else {
DataType::Utf8
}
} else {
DataType::Binary
}
}
fn merge_fields(field_name: &str, possibilities: &mut AHashSet<DataType>) -> Field {
let data_type = match possibilities.len() {
1 => possibilities.drain().next().unwrap(),
2 => {
if possibilities.contains(&DataType::Int64)
&& possibilities.contains(&DataType::Float64)
{
DataType::Float64
} else {
DataType::Utf8
}
}
_ => DataType::Utf8,
};
Field::new(field_name, data_type, true)
}
pub(crate) fn merge_schema(
headers: &[String],
column_types: &mut [AHashSet<DataType>],
) -> Vec<Field> {
headers
.iter()
.zip(column_types.iter_mut())
.map(|(field_name, possibilities)| merge_fields(field_name, possibilities))
.collect()
}