1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
//! Contains regex matching operators [`regex_match`] and [`regex_match_scalar`].

use ahash::AHashMap;
use regex::Regex;

use super::utils::combine_validities;
use crate::array::{BooleanArray, Offset, Utf8Array};
use crate::bitmap::Bitmap;
use crate::datatypes::DataType;
use crate::error::{Error, Result};

/// Regex matches
pub fn regex_match<O: Offset>(values: &Utf8Array<O>, regex: &Utf8Array<O>) -> Result<BooleanArray> {
    if values.len() != regex.len() {
        return Err(Error::InvalidArgumentError(
            "Cannot perform comparison operation on arrays of different length".to_string(),
        ));
    }

    let mut map = AHashMap::new();
    let validity = combine_validities(values.validity(), regex.validity());

    let iterator = values.iter().zip(regex.iter()).map(|(haystack, regex)| {
        if haystack.is_none() | regex.is_none() {
            // regex is expensive => short-circuit if null
            return Result::Ok(false);
        };
        let haystack = haystack.unwrap();
        let regex = regex.unwrap();

        let regex = if let Some(regex) = map.get(regex) {
            regex
        } else {
            let re = Regex::new(regex).map_err(|e| {
                Error::InvalidArgumentError(format!(
                    "Unable to build regex from LIKE pattern: {}",
                    e
                ))
            })?;
            map.insert(regex, re);
            map.get(regex).unwrap()
        };

        Ok(regex.is_match(haystack))
    });
    let new_values = Bitmap::try_from_trusted_len_iter(iterator)?;

    Ok(BooleanArray::new(DataType::Boolean, new_values, validity))
}

/// Regex matches
/// # Example
/// ```
/// use arrow2::array::{Utf8Array, BooleanArray};
/// use arrow2::compute::regex_match::regex_match_scalar;
///
/// let strings = Utf8Array::<i32>::from_slice(&vec!["ArAow", "A_B", "AAA"]);
///
/// let result = regex_match_scalar(&strings, "^A.A").unwrap();
/// assert_eq!(result, BooleanArray::from_slice(&vec![true, false, true]));
/// ```
pub fn regex_match_scalar<O: Offset>(values: &Utf8Array<O>, regex: &str) -> Result<BooleanArray> {
    let regex = Regex::new(regex)
        .map_err(|e| Error::InvalidArgumentError(format!("Unable to compile regex: {}", e)))?;
    Ok(unary_utf8_boolean(values, |x| regex.is_match(x)))
}

/// Applies the predicate `op` to every non-null string of `values` and collects the
/// answers into a [`BooleanArray`].
///
/// Null slots are never passed to `op`; they are written as `false` in the values bitmap.
/// The result reuses a clone of the validity of `values`.
fn unary_utf8_boolean<O: Offset, F: Fn(&str) -> bool>(
    values: &Utf8Array<O>,
    op: F,
) -> BooleanArray {
    let bits = Bitmap::from_trusted_len_iter(values.iter().map(|slot| match slot {
        Some(text) => op(text),
        // Placeholder bit for a null slot.
        None => false,
    }));
    BooleanArray::new(DataType::Boolean, bits, values.validity().cloned())
}