-- Copyright (c) 1991-2002, The Numerical ALgorithms Group Ltd.
-- All rights reserved.
-- Copyright (C) 2007-2010, Gabriel Dos Reis.
-- All rights reserved.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are
-- met:
--
--     - Redistributions of source code must retain the above copyright
--       notice, this list of conditions and the following disclaimer.
--
--     - Redistributions in binary form must reproduce the above copyright
--       notice, this list of conditions and the following disclaimer in
--       the documentation and/or other materials provided with the
--       distribution.
--
--     - Neither the name of The Numerical ALgorithms Group Ltd. nor the
--       names of its contributors may be used to endorse or promote products
--       derived from this software without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-- IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-- TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-- OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


$minThreshold := 3
$maxThreshold := 7
 
--=======================================================================
--                        Build Directories
--=======================================================================
buildOperationWordTable() ==
  $opWordTable := buildWordTable [PNAME x for x in allOperations()]
 
buildWordTable u ==
  table:= hashTable 'EQ
  for s in u repeat
    words := wordsOfString s
    key := UPCASE s.0
    HPUT(table,key,[[s,:words],:HGET(table,key)])
  for key in HKEYS table repeat
    HPUT(table,key,
      listSort(function GLESSEQP,removeDupOrderedAlist
        listSort(function GLESSEQP, HGET(table,key),function first),
          function second))
  table

measureWordTable u ==
  +/[+/[#entry for entry in HGET(u,key)] for key in HKEYS u]

removeDupOrderedAlist u ==
  -- removes duplicate entries in ordered alist
  -- (where duplicates are adjacent)
  for x in tails u repeat
    (y := rest x) and first first x = first first y => x.rest := rest y
  u
 
wordsOfString(s) == [UPCASE x for x in wordsOfStringKeepCase s]
 
wordsOfStringKeepCase s == wordsOfString1(s,0) or [COPY s]
 
wordsOfString1(s,j) ==
  k := or/[i for i in j..(MAXINDEX(s)-1) | upperCase? s.i] =>
    tailWords:=
      upperCase? s.(k+1) =>
        n:= or/[i for i in (k+2)..(MAXINDEX(s)-1)|not upperCase? s.i]
        null n => [SUBSTRING(s,k,nil)]
        n > k+1 => [SUBSTRING(s,k,n-k-1),:wordsOfString1(s,n-1)]
      m := or/[i for i in (k+2)..(MAXINDEX(s)-1) | upperCase? s.i] =>
        [SUBSTRING(s,k,m-k),:wordsOfString1(s,m)]
      [SUBSTRING(s,k,nil)]
    k > j+1 => [SUBSTRING(s,j,k-j),:tailWords]
    tailWords
  nil

wordKeys s == 
  removeDuplicates [UPCASE s.0,:fn(s,1,-1,MAXINDEX s,nil)] where fn(s,i,lastKeyIndex,n,acc) ==
    i > n => acc
    upperCase? s.i =>
--    i = lastKeyIndex + 1 => fn(s,i + 1,i,n,[s.i,:rest acc])
      fn(s,i + 1,i,n,[s.i,:acc])
    fn(s,i + 1,lastKeyIndex,n,acc)

--=======================================================================
--               Augment Function Directories
--=======================================================================
add2WordFunctionTable fn ==
--called from DEF
  $functionTable and
    null LASSOC(s := PNAME fn,HGET($functionTable,(key := UPCASE s.0))) =>
      HPUT($functionTable,key,[[s,:wordsOfString s],:HGET($functionTable,key)])
 
--=======================================================================
--                       Guess Function Name
--=======================================================================
findWords(word,table) == 
  $lastWord := word
  $lastTable:= table
  $totalWords:= nil
  $countThreshold := $minThreshold
  $lastMinimum := -1
  res := findApproximateWords(word,table) 
  if null res then
    $countThreshold := $countThreshold + 2
    res := findApproximateWords(word,table) 
  $lastAlist := mySort res =>
--    $lastMinimum := first LAST $lastAlist
--    $lastWords := wordSort CDAR $lastAlist
--    $totalWords:= $lastWords
--    $lastAlist := rest  $lastAlist
--    $totalWords
      $lastMinimum := CAAR $lastAlist
      $lastWords := wordSort CDAR $lastAlist
      $totalWords:= $lastWords
      $lastAlist := rest  $lastAlist
      $totalWords
  $lastWords := nil

wordSort u == removeDuplicates listSort(function GLESSEQP,u)

more() == moreWords($lastWord,$lastTable)

moreWords(word,table) ==
  $lastAlist =>
     $lastMinimum := first LAST pp $lastAlist
     numberOfLastWords := #$lastWords
     $lastWords := "append"/(ASSOCRIGHT $lastAlist)
     if #$lastWords > numberOfLastWords then 
       trialLastAlist := 
         [p for p in $lastAlist | p.0 < $maxThreshold]
       trialLastWords := "append"/(ASSOCRIGHT trialLastAlist)
       if #trialLastWords > numberOfLastWords then
         $lastWords := trialLastWords         
     $totalWords:= wordSort [:$lastWords,:$totalWords]
     $lastAlist := nil
     $totalWords
  $countThreshold := $countThreshold + 2
  $lastAlist := findApproximateWords(word,table)
  moreWords(word,table)

findApproximateWords(word,table) ==
  count := $countThreshold
  words:= wordsOfString word
  upperWord:= UPCASE COPY word
  n := #words
  threshold:=
    n = 1 => count
    count+1
 
  --first try to break up as list of words
  alist:= nil
  for i in 1..#words repeat
    $penalty :local := (i = 1 => 0; 1)
    wordAlist:= HGET(table,UPCASE (first words).0)
    for [x,:wordList] in wordAlist repeat
      k := findApproxWordList(words,wordList,n,threshold,#wordList) 
      k =>
        k := k + $penalty
        k <= $lastMinimum => 'skip
        alist := consAlist(k,x,alist)
      
    if i = 1 and null alist then
        --no winners, so try flattening to upper case and checking again
        wordSize := SIZE word
        lastThreshold := MAX(threshold - 1,wordSize/2)
        for [x,:.] in wordAlist repeat
          k := deltaWordEntry(upperWord,UPCASE x)
          k < lastThreshold => alist := consAlist(k,x,alist)

    rotateWordList words

  alist

consAlist(x,y,alist) ==
  u := ASSOC(x,alist) => 
    u.rest := [y,:rest u]
    alist
  [[x,y],:alist]

findApproxWordList(words,wordList,n,threshold,w) ==
  val := findApproxWordList1(words,wordList,n,threshold,w)
  null val => val
--pp [val,:wordList]
  val

findApproxWordList1(words,wordList,n,threshold,w) ==
  two := threshold - 2
  n = w => 
      k := findApproxSimple(words,wordList,threshold) => k 

      n < 3 => false
      threshold := threshold - 1
      sum := 0      --next, throw out one bad word

      badWord := false
      for entry in wordList for part in words while sum < threshold repeat
        k:= deltaWordEntry(part,entry)
        k < two => sum:= sum + k
        null badWord => badWord := true
        sum := 1000
      sum < threshold => 
--      pp [2,sum,wordList]
        sum + 2

  n+1 = w =>    --assume one word is missing
      sum := 0
      badWord := false
      for entries in tails wordList for part in words
           while sum < threshold repeat
        entry := first entries
        k:= deltaWordEntry(part,entry)
        k < two => sum:= sum + k
        null badWord =>
          badWord := true
          entries := rest entries      --skip this bad word
          entry := first entries
          k := deltaWordEntry(part,entry)
          k < two => sum := sum + k
          sum := 1000
        sum := 1000
      sum < threshold => 
--      pp [3,sum,wordList]
        sum + 2
      false
  n-1 = w =>    --assume one word too many
      sum := 0  --here: KEEP it hard to satisfy
      badWord := false
      for entry in wordList for parts in tails words
           while sum < threshold repeat
        part := first parts
        k:= deltaWordEntry(part,entry)
        k < 2 => sum:= sum + k
        null badWord =>
          badWord := true
          parts := rest parts      --skip this bad word
          part := first parts
          k := deltaWordEntry(part,entry)
          k < 2 => sum := sum + k
          return (sum := 1000)
        return (sum := 1000)
      sum < threshold =>
--      pp [4,sum,wordList]
        $penalty = 1 => sum
        sum + 1
      false
  false

findApproxSimple(words,wordList,threshold) ==
  sum := 0
  --first try matching words in order
  for entry in wordList for part in words while sum < threshold repeat
    sum:= sum + deltaWordEntry(part,entry)
  sum < threshold => 
--  pp ['"--->",sum,:wordList]
    sum
  nil
      
rotateWordList u ==
  v := u
  p := first v
  while rest v repeat
    v.first := second v
    v := rest v
  v.first := p
  u

deltaWordEntry(word,entry) ==
  word = entry => 0
  word.0 ~= entry.0 => 1000
  #word > 2 and stringPrefix?(word,entry) => 1
  ABS(diff := SIZE word - SIZE entry) > 4 => 1000
  canForgeWord(word,entry)
 
--+ Note these are optimized definitions below-- see commented out versions
--+   to understand the algorithm
canForgeWord(word,entry) ==
  forge(word,0,MAXINDEX word,entry,0,MAXINDEX entry,0)
 
forge(word,w,W,entry,e,E,n) ==
  w > W =>
    e > E => n
    QSADD1 QSPLUS(E-e,n)
  e > E => QSADD1 QSPLUS(W-w,n)
  word.w = entry.e => forge(word,w+1,W,entry,e+1,E,n)
  w=W or e=E => forge(word,w+1,W,entry,e+1,E,QSADD1 n)
  word.w=entry.(e+1) =>
    word.(w+1) = entry.e => forge(word,w+2,W,entry,e+2,E,QSADD1 n)
    forge(word,w+1,W,entry,e+2,E,QSADD1 n)
  word.(w+1)=entry.e => forge(word,w+2,W,entry,e+1,E,QSADD1 n)
 
  (deltaW := W-w) > 1 and (deltaE := E-e) > 1 =>
    --if word is long, can we delete chars to match 2 consective chars
    deltaW >= deltaE and
      (k := or/[j for j in (w+2)..(W-1) | word.j = entry.e])
        and word.(k+1) = entry.(e+1) =>
          forge(word,k+2,W,entry,e+2,E,QSPLUS(k-w,n))
    deltaW <= deltaE and
    --if word is short, can we insert chars so as to match 2 consecutive chars
      (k := or/[j for j in (e+2)..(E-1) | word.w = entry.j])
        and word.(w+1) = entry.(k+1) =>
          forge(word,w+2,W,entry,k+2,E,QSPLUS(n,k-e))
    forge(word,w+1,W,entry,e+1,E,QSADD1 n)
  --check for two consecutive matches down the line
  forge(word,w+1,W,entry,e+1,E,QSADD1 n)
 
--+ DO NOT REMOVE DEFINITIONS BELOW which explain the algorithm
--+ canForgeWord(word,entry) ==--
--+ [d,i,s,t] := forge(word,0,MAXINDEX word,entry,0,MAXINDEX entry,0,0,0,0)
--+ --d=deletions, i=insertions, s=substitutions, t=transpositions
--+ --list is formed only for tuning purposes-- remove later on
--+ d + i + s + t
 
--+forge(word,w,W,entry,e,E,d,i,s,t) ==
--+  w > W =>
--+    e > E => [d,i,s,t]
--+    [d,E-e+i+1,s,t]
--+  e > E => [W-w+d+1,i,s,t]
--+  word.w = entry.e => forge(word,w+1,W,entry,e+1,E,d,i,s,t)
--+  w=W or e=E => forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
--+  word.w=entry.(e+1) =>
--+    word.(w+1) = entry.e => forge(word,w+2,W,entry,e+2,E,d,i,s,t+1)
--+    forge(word,w+1,W,entry,e+2,E,d,i+1,s,t)
--+  word.(w+1)=entry.e => forge(word,w+2,W,entry,e+1,E,d+1,i,s,t)
--+
--+  (deltaW := W-w) > 1 and (deltaE := E-e) > 1 =>
--+    --if word is long, can we delete chars to match 2 consective chars
--+    deltaW >= deltaE and
--+      (k := or/[j for j in (w+2)..(W-1) | word.j = entry.e])
--+        and word.(k+1) = entry.(e+1) =>
--+          forge(word,k+2,W,entry,e+2,E,d+k-w,i,s,t)
--+    deltaW <= deltaE and
--+    --if word is short, can we insert chars so as to match 2 consecutive chars
--+      (k := or/[j for j in (e+2)..(E-1) | word.w = entry.j])
--+        and word.(w+1) = entry.(k+1) =>
--+          forge(word,w+2,W,entry,k+2,E,d,i+k-e,s,t)
--+    forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
--+  --check for two consecutive matches down the line
--+  forge(word,w+1,W,entry,e+1,E,d,i,s+1,t)
 
mySort u == listSort(function GLESSEQP,u)