/* --- Copyright University of Sussex 1995. All rights reserved. ---------- > File: $poplocal/local/lib/quick_substrings.p > Purpose: Quick searching for patterns in strings > Author: David S Young, Dec 23 1994 (see revisions) > Documentation: HELP * QUICK_SUBSTRINGS */ compile_mode:pop11 +strict; section; define lconstant checkstring(s); lvars s; ;;; Used to check that fast_subscrs can be used. ;;; Next constant will be 8 on current machines lconstant (bytesize, ) = field_spec_info("byte"); unless class_spec(datakey(s)) == bytesize then mishap(dataword(s), 1, 'String or byte vector needed') endunless enddefine; define lconstant set_table(shift, pat, lpt, rpt, table); lvars shift, pat, lpt, rpt, table; ;;; This is called when p characters working left from pat(s) have been ;;; matched with the right hand end of pat. (See the help file.) ;;; shift is the shift for a miss, equal to patlen-lpt. ;;; lpt is the index of the next character to inspect, equal to s-p. ;;; rpt is index of next character to compare, equal to patlen-p. ;;; table is the list of properties, starting with entry p+1. lvars t, ch; if lpt == 0 then ;;; have got back to start of pat - set default fast_for t on table do ;;; for j from s to patlen-1 do shift -> fast_front(t); ;;; patlen-s+j -> Table(j+1,*) shift fi_+ 1 -> shift endfor else if (fast_subscrs(lpt, pat) ->> ch) == fast_subscrs(rpt, pat) then set_table( shift fi_+ 1, pat, lpt fi_- 1, rpt fi_- 1, fast_back(table) ) else fast_front(table) -> t; ;;; either integer default or prop unless t.isproperty then ;;; build prop with given default newanyproperty([], 2, 1, 2, false, false, "perm", t, false) ->> t -> fast_front(table) endunless; shift -> fast_apply(ch, t) ;;; patlen-s-p -> Table(p+1, ch) endif; endif enddefine; define lconstant set_table_formatch(pat, patlen, table) -> t; lvars pat, patlen, table, t; ;;; This is a special case of set_table when s=patlen. ;;; rather than 0 is stored to simplify tests. ;;; Result t is the final property, called just before a match. lvars tbl, pt = patlen; fast_for tbl on table do unless (fast_front(tbl) ->> t).isproperty then ;;; should this be a property? Probably - seems faster than ;;; a closure of an ordinary procedure. newproperty([], 2, t, "perm") ->> t -> fast_front(tbl) endunless; false -> fast_apply(fast_subscrs(pt, pat), t); pt fi_- 1 -> pt; endfor enddefine; define lconstant buildtable(pat, strlen) -> (patlen, table, lastproc, successjump, lastchar); lvars pat, strlen, patlen = datalength(pat), table = initl(patlen), lastproc, successjump, lastchar; ;;; As well as the table proper, returns the information needed to ;;; efficiently change the shift returned after a successful match, ;;; by doing successjump + strlen -> lastproc(lastchar) if isword(pat) then word_string(pat) -> pat else checkstring(pat) endif; ;;; Set all shifts except 0 (i.e. do not do s=patlen) lvars s, shift = patlen; fast_for s from 0 to patlen fi_- 1 do set_table(shift, pat, s, patlen, table); shift fi_- 1 -> shift endfor; ;;; Do the s=patlen case specially, to store false rather than ;;; 0. Also get hold of the last procedure. set_table_formatch(pat, patlen, table) -> lastproc; ;;; The jump to continue searching after a match should be the ;;; same as for a failure on the left-hand character. ;;; We add in strlen to ensure that the next index is illegal, and ;;; patlen to ensure that the jump itself is also illegal. unless patlen == 0 then pat(1) -> lastchar; (property_default(lastproc) fi_+ patlen ->> successjump) fi_+ strlen -> fast_apply(lastchar, lastproc) endunless enddefine; defclass lconstant quick_substring_table { tablen, tabtab, tablast, tabsjump, tablastchar }; define quick_substring_table(pat) /* -> table_record */; lvars pat; consquick_substring_table(buildtable(pat, 0)) enddefine; define lconstant quick_substr(pat, n, startlim, endlim, string, findall) -> pos; lvars pat, n, startlim, endlim, string, pos, findall; findall and [] -> pos; ;;; pos starts or [] checkinteger(n, 1, false); if isword(string) then word_string(string) -> string else checkstring(string) endif; lvars patlen, table, strlen = datalength(string); if pat.isquick_substring_table then ;;; restore table pat.tablen -> patlen; pat.tabtab -> table; ;;; table was built with strlen=0 - add strlen to successjump unless patlen == 0 then pat.tabsjump fi_+ strlen -> (pat.tablast)(pat.tablastchar) endunless else buildtable(pat, strlen) -> (patlen, table, , , ) endif; lvars mxindex = strlen; if startlim then checkinteger(startlim, 0, false); fi_min(startlim fi_+ patlen fi_- 1, mxindex) -> mxindex endif; if endlim then checkinteger(endlim, 0, false); fi_min(endlim, mxindex) -> mxindex endif; lvars shift, tbl0, tbl, tablet, maxshift = 2 fi_* patlen fi_- 1, ;;; maximum legal jump extrashift = strlen fi_+ patlen, ;;; added to successjump index = n fi_+ patlen fi_- 1; if patlen == 0 then ;;; Do special case of null pattern if findall then if index == mxindex then conspair(n, pos) -> pos elseif index fi_< mxindex then mishap(0, 'Cannot search for multiple null patterns') endif else if index fi_<= mxindex then n -> pos endif endif else ;;; normal case fast_destpair(table) -> (tbl0, table); until index fi_> mxindex do ;;; test for end of string until index fi_> mxindex do ;;; test for success or end of string tbl0 -> tbl; table -> tablet; until fast_apply(fast_subscrs(index, string), tbl) ->> shift do fast_destpair(tablet) -> (tbl, tablet); index fi_- 1 -> index enduntil; index fi_+ shift -> index enduntil; if shift fi_> maxshift then if findall then conspair(index fi_- shift, pos) -> pos; index fi_- extrashift -> index ;;; index for next match else index fi_- shift -> pos; endif endif enduntil endif enddefine; define quick_issubstring_lim(/* pat, n, startlim, endlim, string */) /* -> pos */ with_nargs 5; quick_substr(/* pat etc. */ false) enddefine; define quick_substrings_lim(/* pat, n, startlim, endlim, string */) /* -> pos */ with_nargs 5; quick_substr(/* pat etc. */ true) enddefine; define quick_issubstring(pat, string) /* -> pos */; lvars pat, n = 1, string; if pat.isinteger then pat -> (pat, n) endif; quick_substr(pat, n, false, false, string, false) enddefine; define quick_substrings(pat, string) /* -> pos */; lvars pat, n = 1, string; if pat.isinteger then pat -> (pat, n) endif; quick_substr(pat, n, false, false, string, true) enddefine; endsection; /* --- Revision History --------------------------------------------------- --- David S Young, Jan 9 1995 Slightly tidied up in general. Calculation of shift after successful match simplified. Made to work with null pattern. --- David S Young, Dec 30 1994 Algorithm improved to no longer access character preceding successful match. Main procedures combined into quick_substr. Some comments added. */