mealy-decompose/src/DotParser.hs

{-# LANGUAGE OverloadedStrings #-}

module DotParser where

import Mealy

import Data.Char (isAlphaNum)
import Data.HashMap.Strict qualified as Map
import Data.HashSet qualified as Set
import Data.Maybe (mapMaybe)
import Data.Text qualified as T
import Data.Text.IO qualified as T
import Data.Void (Void)
import Text.Megaparsec
import Text.Megaparsec.Char (hspace, hspace1, space)
import Text.Megaparsec.Char.Lexer qualified as L

-- * Main function(s)

-- $moduleDoc
--
-- Parser for Dot files generated by LearnLib (and other tools). This is not
-- a fully fledged parser for dot files. It is specific to our encoding of
-- Mealy machines. The parser works line-based, so transitions need to be
-- on a single line. It assumes the first transitions belongs to the initial
-- state. The parser ignores any line which is not a transition.

-- | Read a dot file and convert it to a Mealy machine.
readDotFile :: FilePath -> IO (MealyMachine T.Text T.Text T.Text)
readDotFile dotFile = convertToMealy . mapMaybe (parseMaybe lineP) . T.lines <$> T.readFile dotFile
 where
  -- Parses a full line, ignoring whitespace, until the end of the line
  lineP = hspace *> parseTrans <* space <* eof

-- * Internals

-- | Type of tokens we accept is `T.Text`, but it could be any type which
-- is compatible with Megaparsec.
type Toks = T.Text

-- | A transition is a tuple of state, successor state, input label, and
-- output label
type Trans = (Toks, Toks, Toks, Toks)

-- | Our parser does not have any custom error messages, and always consumes
-- a stream as `T.Text`.
type Parser = Parsec Void Toks

-- | Parse a single transition.
parseTrans :: Parser Trans
parseTrans = assoc <$> identifierQ <* symbol "->" <*> identifierQ <*> brackets parseLabel <* optional (symbol ";")
 where
  -- defines whitespace and lexemes
  sc = L.space hspace1 empty empty
  lexeme = L.lexeme sc
  symbol = L.symbol sc
  -- state, input, output is any string of alphaNumChar's (and some additional characters)
  isAlphaNumExtra c = isAlphaNum c || ('(' <= c && c <= '.') || c == '_'
  alphaNumCharExtra = takeWhile1P (Just "alphanumeric character or extra") isAlphaNumExtra
  identifier = lexeme alphaNumCharExtra
  identifierQ = identifier <|> between (symbol "\"") (symbol "\"") identifier
  -- The label has the shape [label="i/o"]
  brackets = between (symbol "[") (symbol "]")
  parseLabel = (,) <$> (symbol "label=\"" *> identifier <* symbol "/") <*> (identifier <* symbol "\"")
  -- re-associate different parts of the parser
  assoc from to (i, o) = (from, to, i, o)

-- | Convert a list of transitions to a Mealy machine, assumes the first
-- transition belongs to the initial state.
convertToMealy :: [Trans] -> MealyMachine T.Text T.Text T.Text
convertToMealy l =
  MealyMachine
    { states = fmap (allStrs Map.!) . Set.toList $ states
    , inputs = Set.toList $ ins
    , outputs = Set.toList $ outs
    , behaviour = curry (base Map.!)
    , initialState = (allStrs Map.!) . (\(a, _, _, _) -> a) . head $ l
    }
 where
  states = Set.fromList . concatMap (\(a, b, _, _) -> [a, b]) $ l
  ins = Set.fromList . fmap (\(_, _, i, _) -> i) $ l
  outs = Set.fromList . fmap (\(_, _, _, o) -> o) $ l
  -- We put some effort in sharing string values
  allStrs = Map.mapWithKey (\k _ -> k) . Set.toMap . Set.unions $ [states, ins, outs]
  base = Map.fromList . fmap (\(from, to, i, o) -> ((allStrs Map.! from, allStrs Map.! i), (allStrs Map.! o, allStrs Map.! to))) $ l

{- Notes
~~~~~~~~

- Originally I used a `Data.Map` data structure to hold the transitions. But
  it turns out that `Data.HashMap` is much faster. This is important, because
  the `behaviour` function is called a lot during partition refinement. This
  is also the reason for moving from `String` to `T.Text`.

- Even better would be to move the `Int`. But that makes debugging and
  experimenting with the code harder. It is now efficient enough for our
  purposes.

- I have tried @ShortText@ from the @short-text@ package, but it provided
  no benefit over `T.Text`.

-}