Regular expression for a interval of non-negative integers
Let and . I wrote a program in Haskell that generate a regular expression that matches all decimal representation of some number in between and inclusive. The regular expression would have length . I also included a simple version, which shows how the algorithm is done.
The code was made for the decimal system, but of course it can be generalized to any base.
Example:
matchIntRange 123 4321
1(2[3-9]|[3-9]\d)|[2-9]\d\d|[123]\d{3}|4([012]\d\d|3([01]\d|2[01]))
The main idea is to consider a set of tries. Each one contain strings with the same length. The regular expression has a structure closely related to this trie. It be nice if there is a polytime algorithm to generate the shortest regular expression for this problem if all we can use is (), |.
It is important to also handle cases where we will have repeated elements
matchIntRange 12121212 12121212
(12){4}
This is done by recursively try to compress a free monoid element with the FreeMonoidCompress module.
module RegIntIntval (RegEx, matchIntRange, matchLessThan, matchGreaterThan) where | |
import Data.Digits | |
import Text.Regex.PCRE.Light | |
import Data.ByteString (pack) | |
import Data.ByteString.Internal (c2w) | |
import Data.Maybe | |
import Data.List | |
import FreeMonoidCompress | |
-- build regular expression for positive integer ranges | |
-- in base 10. The program uses operations [x-y],|,\d,*,(),{} | |
-- Internal representaion | |
data RegEx = Range Int Int | All | AtLeast RegEx Int | |
| Or [RegEx] | Concat [RegEx] | Repetition RegEx Int Int | |
| Epsilon | Line RegEx | |
deriving (Eq, Ord)-- , Show) | |
-- You could also use "[0-9]" if you like | |
alphabet = "\\d" | |
-- Output the regular expression | |
instance Show RegEx where | |
show (Range i j) | |
| i == j = show i | |
| i+1 == j = concat ["[",show i,show j,"]"] | |
| i+2 == j = concat ["[",show i,show (i+1), show (i+2),"]"] | |
| otherwise = concat ["[",show i,"-",show j,"]"] | |
show (Or xs) = "("++ tail (concatMap (\x->'|':show x) xs) ++ ")" | |
show (Concat xs) = concatMap show xs | |
show (Repetition x n m) | |
| n /= m = concat [c,"{",show n,",",show m,"}"] | |
| length a < length b = a | |
| otherwise = b | |
where a = concat [c,"{",show n,"}"] | |
b = concat (replicate n (show x)) | |
c = if atomic x then show x else "(" ++ show x ++ ")" | |
show Epsilon = "" | |
show (AtLeast x n) | |
| n == 0 = show x ++ "*" | |
| n == 1 = show x ++ "+" | |
| length a < length b = a | |
| otherwise = b | |
where a = concat (replicate n (show x))++"+" | |
b = concat [show x,"{",show n,",}"] | |
show (Line x) = "^" ++ show x ++ "$" | |
show All = alphabet | |
atomic All = True | |
atomic (Range _ _) = True | |
atomic (Concat xs) | |
| length xs > 1 = False | |
| otherwise = atomic (head xs) | |
atomic (Or _) = True | |
atomic Epsilon = True | |
atomic (AtLeast _ _) = False | |
-- Main method. Match integers in a certain range | |
matchIntRange :: Integer->Integer->RegEx | |
matchIntRange a b | |
| 0 > min a b = error "Negative input" | |
| a > b = Line Epsilon | |
| otherwise = Line (mergeBetter $ reduce $ build (d a) (d b)) | |
-- | otherwise = Line (reduce $ build (d a) (d b)) | |
where build :: [Int]->[Int]->RegEx | |
build [] [] = Concat [] | |
build (a@(x:xs)) (b@(y:ys)) | |
| sl && x == y = Concat [Range x x, build xs ys] | |
| sl && all9 && all0 = Concat [Range x y, Repetition All n n] | |
| sl && all0 = Or [Concat [Range x (y-1), Repetition All n n], upper] | |
| sl && all9 = Or [lower,Concat [Range (x+1) y, Repetition All n n]] | |
| sl && x+1 <= y-1 = Or [lower, middle, upper] | |
| sl = Or [lower, upper] | |
| otherwise = Or [build a (nines la), build (1:zeros la) b] | |
where (la,lb) = (length a, length b) | |
sl = la == lb | |
n = length xs | |
upper = Concat [Range y y, build (zeros n) ys] | |
lower = Concat [Range x x, build xs (nines n)] | |
middle = Concat [Range (x+1) (y-1), Repetition All n n] | |
all9 = all (==9) ys | |
all0 = all (==0) xs | |
zeros n = replicate n 0 | |
nines n = replicate n 9 | |
d 0 = [0] | |
d n = map fromIntegral $ digits 10 n | |
-- Transforms | |
-- We should reduce the associative operations | |
-- After reduce, (Or xs) would be (Or ys), where ys contain no Or's. | |
reduce :: RegEx->RegEx | |
reduce (Or xs) | |
| length xs == 1 = head xs | |
| otherwise = Or (reverse $ foldl combine [] result) | |
where result = map reduce xs | |
combine agg (Or ys) = reverse ys++agg | |
combine agg n = n:agg | |
reduce (Concat xs) | |
| length xs == 1 = head xs | |
| otherwise = Concat (reverse $ foldl combine [] result) | |
where result = map reduce xs | |
combine agg (Concat ys) = reverse ys ++ agg | |
combine agg n = n:agg | |
reduce x = x | |
-- merge repeated set of all strings together | |
-- commutative thus sorting is nice | |
mergeAll :: RegEx->RegEx | |
mergeAll (Or xs) = Or (f [] v) | |
where v :: [RegEx] | |
v = sort $ map mergeAll xs | |
u :: RegEx->[RegEx] | |
u (Concat x) = x | |
u x = [] | |
f :: [RegEx]->[RegEx]->[RegEx] | |
f [] (x:xs) = f [x] xs | |
f accum [] = accum | |
f accum xs = f (init accum ++ merges (last accum) (head xs)) (tail xs) | |
merges :: RegEx->RegEx->[RegEx] | |
merges xs ys | |
| notnull && px == py = m sx sy | |
| otherwise = [xs,ys] | |
where m (Repetition x a b) (Repetition y c d) | |
| x == y && a<=d && b>=c || b+1 == c = [Concat (px ++ [Repetition x (min a c) (max b d)])] | |
| otherwise = [xs,ys] | |
m (Repetition x a b) (AtLeast y c) | |
| x == y && c<=b+1 = [Concat (px ++ [AtLeast x (min a c)])] | |
| otherwise = [xs,ys] | |
m x y = [xs,ys] | |
(px,sx) = (init $ u xs, last $ u xs) | |
(py,sy) = (init $ u ys, last $ u ys) | |
notnull = min (length $ u xs) (length $ u ys) > 0 | |
-- Not commutative | |
mergeAll (Concat xs) = mergeMonoid (Concat (map mergeAll xs)) | |
mergeAll x = x | |
atom (Power _ _) = False | |
atom (Atom x) = atomic x | |
atom (List xs) -- = False | |
| length xs > 1 = False | |
| otherwise = atom (head xs) | |
mergeMonoid (Concat xs) = Concat (expand result) | |
where result = freeMonoidCompress regexWeight xs | |
expand (Atom x) = [x] | |
expand (List xs) = concatMap expand xs | |
expand (Power x n) = [Repetition (Concat (expand x)) n n] | |
regexWeight (Atom x) = length $ show x | |
regexWeight (List xs) = sum $ map regexWeight xs | |
regexWeight (Power x n) = min (regexWeight $ List (replicate n x)) (overhead1 + 2 + length (show n) + w) | |
where w = regexWeight x | |
overhead1 = if (atom x) then 0 else 2 | |
-- because we like the ordering a lot | |
mergeBetter x | |
| length (show x) > length (show y) = y | |
| otherwise = x | |
where y = mergeAll x | |
matchLessThan :: Integer->RegEx | |
matchLessThan n = Line (matchIntRange 0 n) | |
matchGreaterThan :: Integer->RegEx | |
matchGreaterThan a = Line (mergeBetter $ reduce $ Or [matchIntRange a b,Concat [Range 1 9, AtLeast All n]]) | |
where n = length $ digits 10 a | |
b = unDigits 10 $ replicate n 9 | |
-- Tests | |
--Assert if matchIntRange i j works | |
test i j = all isJust [match re (num k) []|k<-[i..j]] && | |
all isNothing [match re (num k) []|k<-[0..i-1]] && | |
all isNothing [match re (num k) []|k<-[j+1..9*j]] | |
where re = compile (s2w ("^("++show (mergeAll $ matchIntRange i j)++")$") ) [] | |
num n = s2w $ show n | |
s2w = pack . map c2w |