/* --- Copyright University of Sussex 1995. All rights reserved. ----------
 > File:            $poplocal/local/lib/quick_substrings.p
 > Purpose:         Quick searching for patterns in strings
 > Author:          David S Young, Dec 23 1994 (see revisions)
 > Documentation:   HELP * QUICK_SUBSTRINGS
 */

compile_mode:pop11 +strict;

section;

define lconstant checkstring(s);
    lvars s;
    ;;; Used to check that fast_subscrs can be used.
    ;;; Next constant will be 8 on current machines
    lconstant (bytesize, ) = field_spec_info("byte");
    unless class_spec(datakey(s)) == bytesize then
        mishap(dataword(s), 1, 'String or byte vector needed')
    endunless
enddefine;

define lconstant set_table(shift, pat, lpt, rpt, table);
    lvars shift, pat, lpt, rpt, table;

    ;;; This is called when p characters working left from pat(s) have been
    ;;; matched with the right hand end of pat. (See the help file.)
    ;;; shift is the shift for a miss, equal to patlen-lpt.
    ;;; lpt is the index of the next character to inspect, equal to s-p.
    ;;; rpt is index of next character to compare, equal to patlen-p.
    ;;; table is the list of properties, starting with entry p+1.

    lvars t, ch;
    if lpt == 0 then          ;;; have got back to start of pat - set default
        fast_for t on table do          ;;; for j from s to patlen-1 do
            shift -> fast_front(t);     ;;;     patlen-s+j -> Table(j+1,*)
            shift fi_+ 1 -> shift
        endfor
    else
        if (fast_subscrs(lpt, pat) ->> ch) == fast_subscrs(rpt, pat) then
            set_table(
                shift fi_+ 1, pat, lpt fi_- 1, rpt fi_- 1, fast_back(table)
            )
        else
            fast_front(table) -> t;     ;;; either integer default or prop
            unless t.isproperty then    ;;; build prop with given default
                newanyproperty([], 2, 1, 2, false, false, "perm", t, false)
                    ->> t -> fast_front(table)
            endunless;
            shift -> fast_apply(ch, t)  ;;; patlen-s-p -> Table(p+1, ch)
        endif;
    endif
enddefine;

define lconstant set_table_formatch(pat, patlen, table) -> t;
    lvars pat, patlen, table, t;

    ;;; This is a special case of set_table when s=patlen.
    ;;; <false> rather than 0 is stored to simplify tests.
    ;;; Result t is the final property, called just before a match.

    lvars tbl, pt = patlen;
    fast_for tbl on table do
        unless (fast_front(tbl) ->> t).isproperty then
            ;;; should this be a property? Probably - seems faster than
            ;;; a closure of an ordinary procedure.
            newproperty([], 2, t, "perm") ->> t -> fast_front(tbl)
        endunless;
        false -> fast_apply(fast_subscrs(pt, pat), t);
        pt fi_- 1 -> pt;
    endfor
enddefine;

define lconstant buildtable(pat, strlen)
        -> (patlen, table, lastproc, successjump, lastchar);
    lvars pat, strlen,
        patlen = datalength(pat),
        table = initl(patlen),
        lastproc, successjump, lastchar;

    ;;; As well as the table proper, returns the information needed to
    ;;; efficiently change the shift returned after a successful match,
    ;;; by doing successjump + strlen -> lastproc(lastchar)

    if isword(pat) then
        word_string(pat) -> pat
    else
        checkstring(pat)
    endif;

    ;;; Set all shifts except 0 (i.e. do not do s=patlen)
    lvars s, shift = patlen;
    fast_for s from 0 to patlen fi_- 1 do
        set_table(shift, pat, s, patlen, table);
        shift fi_- 1 -> shift
    endfor;

    ;;; Do the s=patlen case specially, to store false rather than
    ;;; 0. Also get hold of the last procedure.
    set_table_formatch(pat, patlen, table) -> lastproc;

    ;;; The jump to continue searching after a match should be the
    ;;; same as for a failure on the left-hand character.
    ;;; We add in strlen to ensure that the next index is illegal, and
    ;;; patlen to ensure that the jump itself is also illegal.
    unless patlen == 0 then
        pat(1) -> lastchar;
        (property_default(lastproc) fi_+ patlen ->> successjump)
        fi_+ strlen -> fast_apply(lastchar, lastproc)
    endunless
enddefine;

defclass lconstant quick_substring_table {
    tablen, tabtab, tablast, tabsjump, tablastchar
};

define quick_substring_table(pat) /* -> table_record */;
    lvars pat;
    consquick_substring_table(buildtable(pat, 0))
enddefine;

define lconstant quick_substr(pat, n, startlim, endlim, string, findall)
        -> pos;
    lvars pat, n, startlim, endlim, string, pos, findall;

    findall and [] -> pos;      ;;; pos starts <false> or []
    checkinteger(n, 1, false);
    if isword(string) then
        word_string(string) -> string
    else
        checkstring(string)
    endif;

    lvars patlen, table, strlen = datalength(string);
    if pat.isquick_substring_table then     ;;; restore table
        pat.tablen -> patlen;
        pat.tabtab -> table;
        ;;; table was built with strlen=0 - add strlen to successjump
        unless patlen == 0 then
            pat.tabsjump fi_+ strlen -> (pat.tablast)(pat.tablastchar)
        endunless
    else
        buildtable(pat, strlen) -> (patlen, table, , , )
    endif;

    lvars mxindex = strlen;
    if startlim then
        checkinteger(startlim, 0, false);
        fi_min(startlim fi_+ patlen fi_- 1, mxindex) -> mxindex
    endif;
    if endlim then
        checkinteger(endlim, 0, false);
        fi_min(endlim, mxindex) -> mxindex
    endif;

    lvars
        shift, tbl0, tbl, tablet,
        maxshift = 2 fi_* patlen fi_- 1,    ;;; maximum legal jump
        extrashift = strlen fi_+ patlen,    ;;; added to successjump
        index = n fi_+ patlen fi_- 1;

    if patlen == 0 then         ;;; Do special case of null pattern
        if findall then
            if index == mxindex then
                conspair(n, pos) -> pos
            elseif index fi_< mxindex then
                mishap(0, 'Cannot search for multiple null patterns')
            endif
        else
            if index fi_<= mxindex then
                n -> pos
            endif
        endif

    else        ;;; normal case
        fast_destpair(table) -> (tbl0, table);
        until index fi_> mxindex do      ;;; test for end of string
            until index fi_> mxindex do  ;;; test for success or end of string
                tbl0 -> tbl;
                table -> tablet;
                until fast_apply(fast_subscrs(index, string), tbl) ->> shift do
                    fast_destpair(tablet) -> (tbl, tablet);
                    index fi_- 1 -> index
                enduntil;
                index fi_+ shift -> index
            enduntil;
            if shift fi_> maxshift then
                if findall then
                    conspair(index fi_- shift, pos) -> pos;
                    index fi_- extrashift -> index    ;;; index for next match
                else
                    index fi_- shift -> pos;
                endif
            endif
        enduntil
    endif

enddefine;

define quick_issubstring_lim(/* pat, n, startlim, endlim, string */)
        /* -> pos */ with_nargs 5;
    quick_substr(/* pat etc. */ false)
enddefine;

define quick_substrings_lim(/* pat, n, startlim, endlim, string */)
        /* -> pos */ with_nargs 5;
    quick_substr(/* pat etc. */ true)
enddefine;

define quick_issubstring(pat, string) /* -> pos */;
    lvars pat, n = 1, string;
    if pat.isinteger then
        pat -> (pat, n)
    endif;
    quick_substr(pat, n, false, false, string, false)
enddefine;

define quick_substrings(pat, string) /* -> pos */;
    lvars pat, n = 1, string;
    if pat.isinteger then
        pat -> (pat, n)
    endif;
    quick_substr(pat, n, false, false, string, true)
enddefine;

endsection;

/* --- Revision History ---------------------------------------------------
--- David S Young, Jan  9 1995
        Slightly tidied up in general.
        Calculation of shift after successful match simplified.
        Made to work with null pattern.
--- David S Young, Dec 30 1994
        Algorithm improved to no longer access character preceding
        successful match. Main procedures combined into quick_substr.
        Some comments added.
 */

