aboutsummaryrefslogtreecommitdiff
path: root/src/interp/guess.boot
diff options
context:
space:
mode:
Diffstat (limited to 'src/interp/guess.boot')
-rw-r--r--src/interp/guess.boot349
1 files changed, 349 insertions, 0 deletions
diff --git a/src/interp/guess.boot b/src/interp/guess.boot
new file mode 100644
index 00000000..1aeefc88
--- /dev/null
+++ b/src/interp/guess.boot
@@ -0,0 +1,349 @@
+-- Copyright (c) 1991-2002, The Numerical ALgorithms Group Ltd.
+-- All rights reserved.
+-- Copyright (C) 2007, Gabriel Dos Reis.
+-- All rights reserved.
+--
+-- Redistribution and use in source and binary forms, with or without
+-- modification, are permitted provided that the following conditions are
+-- met:
+--
+-- - Redistributions of source code must retain the above copyright
+-- notice, this list of conditions and the following disclaimer.
+--
+-- - Redistributions in binary form must reproduce the above copyright
+-- notice, this list of conditions and the following disclaimer in
+-- the documentation and/or other materials provided with the
+-- distribution.
+--
+-- - Neither the name of The Numerical ALgorithms Group Ltd. nor the
+-- names of its contributors may be used to endorse or promote products
+-- derived from this software without specific prior written permission.
+--
+-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+-- IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+-- TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+-- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+-- OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+-- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+-- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+$minThreshold := 3
+$maxThreshold := 7
+
+--=======================================================================
+-- Build Directories
+--=======================================================================
+buildOperationWordTable() ==
+ $opWordTable := buildWordTable [PNAME x for x in allOperations()]
+
+buildWordTable u ==
+ table:= MAKE_-HASHTABLE 'ID
+ for s in u repeat
+ words := wordsOfString s
+ key := UPCASE s.0
+ HPUT(table,key,[[s,:words],:HGET(table,key)])
+ for key in HKEYS table repeat
+ HPUT(table,key,
+ listSort(function GLESSEQP,removeDupOrderedAlist
+ listSort(function GLESSEQP, HGET(table,key),function CAR),
+ function CADR))
+ table
+
+measureWordTable u ==
+ +/[+/[#entry for entry in HGET(u,key)] for key in HKEYS u]
+
+removeDupOrderedAlist u ==
+ -- removes duplicate entries in ordered alist
+ -- (where duplicates are adjacent)
+ for x in tails u repeat
+ (y := rest x) and first first x = first first y => RPLACD(x,rest y)
+ u
+
+wordsOfString(s) == [UPCASE x for x in wordsOfStringKeepCase s]
+
+wordsOfStringKeepCase s == wordsOfString1(s,0) or [COPY s]
+
+wordsOfString1(s,j) ==
+ k := or/[i for i in j..SUB1(MAXINDEX(s)) | UPPER_-CASE_-P s.i] =>
+ tailWords:=
+ UPPER_-CASE_-P s.(k+1) =>
+ n:= or/[i for i in (k+2)..SUB1(MAXINDEX(s))|not UPPER_-CASE_-P s.i]
+ null n => [SUBSTRING(s,k,nil)]
+ n > k+1 => [SUBSTRING(s,k,n-k-1),:wordsOfString1(s,n-1)]
+ m := or/[i for i in (k+2)..SUB1(MAXINDEX(s)) | UPPER_-CASE_-P s.i] =>
+ [SUBSTRING(s,k,m-k),:wordsOfString1(s,m)]
+ [SUBSTRING(s,k,nil)]
+ k > j+1 => [SUBSTRING(s,j,k-j),:tailWords]
+ tailWords
+ nil
+
+wordKeys s ==
+ REMDUP [UPCASE s.0,:fn(s,1,-1,MAXINDEX s,nil)] where fn(s,i,lastKeyIndex,n,acc) ==
+ i > n => acc
+ UPPER_-CASE_-P s.i =>
+-- i = lastKeyIndex + 1 => fn(s,i + 1,i,n,[s.i,:rest acc])
+ fn(s,i + 1,i,n,[s.i,:acc])
+ fn(s,i + 1,lastKeyIndex,n,acc)
+
+--=======================================================================
+-- Augment Function Directories
+--=======================================================================
+add2WordFunctionTable fn ==
+--called from DEF
+ $functionTable and
+ null LASSOC(s := PNAME fn,HGET($functionTable,(key := UPCASE s.0))) =>
+ HPUT($functionTable,key,[[s,:wordsOfString s],:HGET($functionTable,key)])
+
+--=======================================================================
+-- Guess Function Name
+--=======================================================================
+findWords(word,table) ==
+ $lastWord := word
+ $lastTable:= table
+ $totalWords:= nil
+ $countThreshold := $minThreshold
+ $lastMinimum := -1
+ res := findApproximateWords(word,table)
+ if null res then
+ $countThreshold := $countThreshold + 2
+ res := findApproximateWords(word,table)
+ $lastAlist := mySort res =>
+-- $lastMinimum := CAR LAST $lastAlist
+-- $lastWords := wordSort CDAR $lastAlist
+-- $totalWords:= $lastWords
+-- $lastAlist := CDR $lastAlist
+-- $totalWords
+ $lastMinimum := CAAR $lastAlist
+ $lastWords := wordSort CDAR $lastAlist
+ $totalWords:= $lastWords
+ $lastAlist := CDR $lastAlist
+ $totalWords
+ $lastWords := nil
+
+wordSort u == REMDUP listSort(function GLESSEQP,u)
+
+more() == moreWords($lastWord,$lastTable)
+
+moreWords(word,table) ==
+ $lastAlist =>
+ $lastMinimum := CAR LAST pp $lastAlist
+ numberOfLastWords := #$lastWords
+ $lastWords := "append"/(ASSOCRIGHT $lastAlist)
+ if #$lastWords > numberOfLastWords then
+ trialLastAlist :=
+ [p for p in $lastAlist | p.0 < $maxThreshold]
+ trialLastWords := "append"/(ASSOCRIGHT trialLastAlist)
+ if #trialLastWords > numberOfLastWords then
+ $lastWords := trialLastWords
+ $totalWords:= wordSort [:$lastWords,:$totalWords]
+ $lastAlist := nil
+ $totalWords
+ $countThreshold := $countThreshold + 2
+ $lastAlist := findApproximateWords(word,table)
+ moreWords(word,table)
+
+findApproximateWords(word,table) ==
+ count := $countThreshold
+ words:= wordsOfString word
+ upperWord:= UPCASE COPY word
+ n := #words
+ threshold:=
+ n = 1 => count
+ count+1
+
+ --first try to break up as list of words
+ alist:= nil
+ for i in 1..#words repeat
+ $penalty :local := (i = 1 => 0; 1)
+ wordAlist:= HGET(table,UPCASE (first words).0)
+ for [x,:wordList] in wordAlist repeat
+ k := findApproxWordList(words,wordList,n,threshold,#wordList)
+ k =>
+ k := k + $penalty
+ k <= $lastMinimum => 'skip
+ alist := consAlist(k,x,alist)
+
+ if i = 1 and null alist then
+ --no winners, so try flattening to upper case and checking again
+ wordSize := SIZE word
+ lastThreshold := MAX(threshold - 1,wordSize/2)
+ for [x,:.] in wordAlist repeat
+ k := deltaWordEntry(upperWord,UPCASE x)
+ k < lastThreshold => alist := consAlist(k,x,alist)
+
+ rotateWordList words
+
+ alist
+
+consAlist(x,y,alist) ==
+ u := ASSOC(x,alist) =>
+ RPLACD(u,[y,:CDR u])
+ alist
+ [[x,y],:alist]
+
+findApproxWordList(words,wordList,n,threshold,w) ==
+ val := findApproxWordList1(words,wordList,n,threshold,w)
+ null val => val
+--pp [val,:wordList]
+ val
+
+findApproxWordList1(words,wordList,n,threshold,w) ==
+ two := threshold - 2
+ n = w =>
+ k := findApproxSimple(words,wordList,threshold) => k
+
+ n < 3 => false
+ threshold := threshold - 1
+ sum := 0 --next, throw out one bad word
+
+ badWord := false
+ for entry in wordList for part in words while sum < threshold repeat
+ k:= deltaWordEntry(part,entry)
+ k < two => sum:= sum + k
+ null badWord => badWord := true
+ sum := 1000
+ sum < threshold =>
+-- pp [2,sum,wordList]
+ sum + 2
+
+ n+1 = w => --assume one word is missing
+ sum := 0
+ badWord := false
+ for entries in tails wordList for part in words
+ while sum < threshold repeat
+ entry := first entries
+ k:= deltaWordEntry(part,entry)
+ k < two => sum:= sum + k
+ null badWord =>
+ badWord := true
+ entries := rest entries --skip this bad word
+ entry := first entries
+ k := deltaWordEntry(part,entry)
+ k < two => sum := sum + k
+ sum := 1000
+ sum := 1000
+ sum < threshold =>
+-- pp [3,sum,wordList]
+ sum + 2
+ false
+ n-1 = w => --assume one word too many
+ sum := 0 --here: KEEP it hard to satisfy
+ badWord := false
+ for entry in wordList for parts in tails words
+ while sum < threshold repeat
+ part := first parts
+ k:= deltaWordEntry(part,entry)
+ k < 2 => sum:= sum + k
+ null badWord =>
+ badWord := true
+ parts := rest parts --skip this bad word
+ part := first parts
+ k := deltaWordEntry(part,entry)
+ k < 2 => sum := sum + k
+ return (sum := 1000)
+ return (sum := 1000)
+ sum < threshold =>
+-- pp [4,sum,wordList]
+ $penalty = 1 => sum
+ sum + 1
+ false
+ false
+
+findApproxSimple(words,wordList,threshold) ==
+ sum := 0
+ --first try matching words in order
+ for entry in wordList for part in words while sum < threshold repeat
+ sum:= sum + deltaWordEntry(part,entry)
+ sum < threshold =>
+-- pp ['"--->",sum,:wordList]
+ sum
+ nil
+
+rotateWordList u ==
+ v := u
+ p := CAR v
+ while QCDR v repeat
+ RPLACA(v,CADR v)
+ v := QCDR v
+ RPLACA(v,p)
+ u
+
+deltaWordEntry(word,entry) ==
+ word = entry => 0
+ word.0 ^= entry.0 => 1000
+ #word > 2 and stringPrefix?(word,entry) => 1
+ ABS(diff := SIZE word - SIZE entry) > 4 => 1000
+ canForgeWord(word,entry)
+
+--+ Note these are optimized definitions below-- see commented out versions
+--+ to understand the algorithm
+canForgeWord(word,entry) ==
+ forge(word,0,MAXINDEX word,entry,0,MAXINDEX entry,0)
+
+forge(word,w,W,entry,e,E,n) ==
+ w > W =>
+ e > E => n
+ QSADD1 QSPLUS(E-e,n)
+ e > E => QSADD1 QSPLUS(W-w,n)
+ word.w = entry.e => forge(word,w+1,W,entry,e+1,E,n)
+ w=W or e=E => forge(word,w+1,W,entry,e+1,E,QSADD1 n)
+ word.w=entry.(e+1) =>
+ word.(w+1) = entry.e => forge(word,w+2,W,entry,e+2,E,QSADD1 n)
+ forge(word,w+1,W,entry,e+2,E,QSADD1 n)
+ word.(w+1)=entry.e => forge(word,w+2,W,entry,e+1,E,QSADD1 n)
+
+ (deltaW := W-w) > 1 and (deltaE := E-e) > 1 =>
+ --if word is long, can we delete chars to match 2 consective chars
+ deltaW >= deltaE and
+ (k := or/[j for j in (w+2)..(W-1) | word.j = entry.e])
+ and word.(k+1) = entry.(e+1) =>
+ forge(word,k+2,W,entry,e+2,E,QSPLUS(k-w,n))
+ deltaW <= deltaE and
+ --if word is short, can we insert chars so as to match 2 consecutive chars
+ (k := or/[j for j in (e+2)..(E-1) | word.w = entry.j])
+ and word.(w+1) = entry.(k+1) =>
+ forge(word,w+2,W,entry,k+2,E,QSPLUS(n,k-e))
+ forge(word,w+1,W,entry,e+1,E,QSADD1 n)
+ --check for two consecutive matches down the line
+ forge(word,w+1,W,entry,e+1,E,QSADD1 n)
+
+--+ DO NOT REMOVE DEFINITIONS BELOW which explain the algorithm
+--+ canForgeWord(word,entry) ==--
+--+ [d,i,s,t] := forge(word,0,MAXINDEX word,entry,0,MAXINDEX entry,0,0,0,0)
+--+ --d=deletions, i=insertions, s=substitutions, t=transpositions
+--+ --list is formed only for tuning purposes-- remove later on
+--+ d + i + s + t
+
+--+forge(word,w,W,entry,e,E,d,i,s,t) ==
+--+ w > W =>
+--+ e > E => [d,i,s,t]
+--+ [d,E-e+i+1,s,t]
+--+ e > E => [W-w+d+1,i,s,t]
+--+ word.w = entry.e => forge(word,w+1,W,entry,e+1,E,d,i,s,t)
+--+ w=W or e=E => forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
+--+ word.w=entry.(e+1) =>
+--+ word.(w+1) = entry.e => forge(word,w+2,W,entry,e+2,E,d,i,s,t+1)
+--+ forge(word,w+1,W,entry,e+2,E,d,i+1,s,t)
+--+ word.(w+1)=entry.e => forge(word,w+2,W,entry,e+1,E,d+1,i,s,t)
+--+
+--+ (deltaW := W-w) > 1 and (deltaE := E-e) > 1 =>
+--+ --if word is long, can we delete chars to match 2 consective chars
+--+ deltaW >= deltaE and
+--+ (k := or/[j for j in (w+2)..(W-1) | word.j = entry.e])
+--+ and word.(k+1) = entry.(e+1) =>
+--+ forge(word,k+2,W,entry,e+2,E,d+k-w,i,s,t)
+--+ deltaW <= deltaE and
+--+ --if word is short, can we insert chars so as to match 2 consecutive chars
+--+ (k := or/[j for j in (e+2)..(E-1) | word.w = entry.j])
+--+ and word.(w+1) = entry.(k+1) =>
+--+ forge(word,w+2,W,entry,k+2,E,d,i+k-e,s,t)
+--+ forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
+--+ --check for two consecutive matches down the line
+--+ forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
+
+mySort u == listSort(function GLESSEQP,u)