The Art Gallery Guardian

The Aho–Corasick Automaton in Haskell


It is common that one might want to match \(k\) different strings against one single text of length \(m\). One can of course apply the KMP algorithm individually, and result an algorithm that runs in \(O(km)\) time.

Faster algorithms are known. The idea is to build an finite state transducer that can output which strings is the suffix of the string it read. The Aho-Corasick automaton is a compressed version of such transducer, as the size does not depend on the size of the alphabet.

import           Control.Arrow (first)
import           Data.Function (on)
import           Data.List     (lookup, partition)
import           Data.Maybe    (fromMaybe, Maybe (..))
import           Data.Monoid   (All (..), Monoid, getAll, mappend, mconcat,
                                mempty)
data Automaton a b = Node {delta  :: a -> Automaton a b,
                           output :: b
                         }

equivalentClasses :: (a->a->Bool)->[a]->[[a]]
equivalentClasses eq = foldl parts []
  where parts [] a = [[a]]
        parts (x:xs) a
         | eq (head x) a = (a:x):xs
         | otherwise     = x:parts xs a

buildAutomaton :: (Monoid b,Eq a) => [([a],b)] -> Automaton a b
buildAutomaton xs = automaton
  where automaton = build (const automaton) xs mempty

build :: (Monoid b,Eq a)=> (a -> Automaton a b) -> [([a],b)] -> b -> Automaton a b
build trans xs out = node
  where node  = Node (\x->fromMaybe (trans x) (lookup x table)) out
        table =  map transPair $ equivalentClasses (on (==) (head . fst)) xs
        transPair xs = (a, build (delta (trans a)) ys out)
         where a  = head $ fst $ head xs
               (ys,zs) = partition (not . null . fst) $ map (first tail) xs
               out = mappend (mconcat $ map snd zs) (output $ trans a)

match :: Eq a => Automaton a b -> [a] -> [b]
match a xs = map output $ scanl delta a xs

match' :: Eq a => [[a]] -> [a] -> [[[a]]]
match' pat = match (buildAutomaton $ map (\x-> (x,[x])) pat)

isInfixOf' :: Eq a => [a] -> [a] -> Bool
isInfixOf' xs ys = getAll $ mconcat $ match (buildAutomaton [(xs, All True)]) ys
Posted by Chao Xu on 2014-04-25.
Tags: .