KMPBC：KMP算法及其改进（kmp with bad character）

前言

最近在看字符串匹配算法，突然灵光一闪有了想法，可以把kmp算法时间效率提高，同时保持最坏时间复杂度O(n+m)不变。其中n为主串长度，m为模式串长度，经测试可以块3-10倍，以为发现了新大陆，但是查阅文献后发现已经有了类似了改进。所以发表在CSDN上就算成功！

算法过程

改进后的KMP算法如下：
首先计算next数组，按照kmp一般方法即可。
其次根据sunday算法计算shift数组。
然后用 $i$ 指向主串s某个字符，j指向模式p中的某个字符，从左到右进行匹配。

如果i和j的字符相等，i,j两个指针同时向右移动。
如果i和j的字符不同，则考虑使用坏字符规则，
- 如果用坏字符规则能使i变大，则根据规则移动i, 并令j=0。
- 否则，按照kmp的方法回退指针j。

public int match(String s, String p) {
    int n = s.length();
    int m = p.length();
    int[] next = buildNext(p);    
    int[] shift = buildShift(p);

    int i = 0;
    int j = 0;
    while(i < n && j < m){
        if(s.charAt(i) == p.charAt(j)){
            i++;           // #1
            j++;
        }else{             // #2
            int bad = i - j + m;
            if(bad < n){
              int nexti = bad - shift[s.charAt(bad)] + 1;
              if(nexti > i){
                i = nexti;
                j = 0;
                continue;
              }
            }
            if(j == 0) { // #3
                i++;
            }else{
                j = next[j-1];
            }
        }
    }
    return j == m ? i - m : -1;
  }

算法分析

如上一节代码：算法总体由if部分和else 两部分组成。
除了#3在else部分位置无论是否使用坏字符规则都会对j进行回退（即将j减小）。我们考虑j的值是什么时候增加的，显然是在#1的时候。但#1和#3执行次数加起来不会超过n, 因此j回退次数也不会超过n。

所以时间复杂度为 $\le 2n = O(n)$ ，
如果加上预处理buildNext()和buildShift()的话， $T (n) = O (n + m)$ 。

实验对比

实验对KMP、BM、Sunday、KMPBC进行了比较，随机生成10000个字符串并随机生成它们的模式串。
四个算法某次运行结果如下，前四行展示了算法的运行时间，最后两个对比了Sunday和KMPBC的比较次数。
在这里插入图片描述
改进后的算法比较次数比sunday更少，且做到了理论上线性。

package org.example;

import java.util.*;

interface IStringMatch {
  int match(String s, String p);
}

class KMP implements IStringMatch {
  
  public static int[] buildNext(String p){
    int n = p.length();
    int[] next = new int[n];  // next[i] 表示 p[0..i] 最长共公前后缀和长度
    int pre = 0; //当前缀长度
    for(int i = 1; i < n; i++){
        if(p.charAt(i) == p.charAt(pre)){
            pre ++;
            next[i] = pre;
        }else{
            if(pre == 0) { // 防上pre-1溢出
                next[i] = 0;
            }else{
                pre = next[pre - 1]; // 在pre之间寻找更小的公共前后缀
                i--; 
            }     
        }
    }
    return next;
  }
  
  public int kmp(String s, String p){
    int n = s.length();
    int m = p.length();
    int[] next = buildNext(p);
    int i = 0;
    int j = 0;
    while(i < n && j < m){
        if(s.charAt(i) == p.charAt(j)){
            i++;
            j++;
        }else{
            if(j == 0) {  // 防上j-1溢出
                i++;
            }else{
                j = next[j-1];
            }
        }
    }
    return j == m ? i - m : -1;
  }

  @Override
  public int match(String s, String p) {
    return kmp(s, p);
  }
}

class BM implements IStringMatch {

  public static int[] buildShift(String p){
    int[] set = new int[256];
    for(int i = 0; i < p.length(); i++){
      set[p.charAt(i)] = (i + 1); // 记录从1开始的位置
    }
    return set;
  }


  @Override
  public int match(String s, String p) {

    // 1. 坏字符规则
    // 2. 好后缀规则
    // 这里直接引用第三方的实现：见附录
    return BMext.indexOf(s.toCharArray(), p.toCharArray());
  }
  
}

class Sunday implements IStringMatch {

  
  static int count = 0;

  @Override
  public int match(String s, String p) {
    int n = s.length();
    int m = p.length();  
    int[] shift = BM.buildShift(p);
    int i = 0;
    int j = 0;
    while(i < n && j < m){
      count ++;
      if(s.charAt(i) == p.charAt(j)){
        i ++;
        j ++;
      }else{
        int bad = i - j + m;
        if(bad < n){
          i += m - (shift[s.charAt(bad)] - 1);
          i -= j;
          j = 0;
        }else{
          return -1;
        }
      }
    }
    return j == m ? i - m : -1;
  }
  
}

class KMPWithBC implements IStringMatch {

  static int count = 0;
  
  @Override
  public int match(String s, String p) {
    int n = s.length();
    int m = p.length();
    int[] next = KMP.buildNext(p);    
    int[] shift = BM.buildShift(p);

    int i = 0;
    int j = 0;
    while(i < n && j < m){
        count ++;
        // 环字符规则加在这也行
        // int bad = i + (m - j) - 1;
        // if(bad < n && shift[s.charAt(bad)] == 0){
        //   i = bad + 1;
        //   j = 0;
        //   continue;
        // }
        if(s.charAt(i) == p.charAt(j)){
            i++;
            j++;
        }else{
            int bad2 = i - j + m;
            if(bad2 < n){
              int nexti = bad2 - shift[s.charAt(bad2)] + 1;
              if(nexti > i){
                i = nexti;
                j = 0;
                continue;
              }
            }
            if(j == 0) {  // 防上j-1溢出
                i++;
            }else{
                j = next[j-1];
            }
        }
    }
    return j == m ? i - m : -1;
  }

}


public class Main{

  
  static Random random=new Random();

  public static String getRandomString(int length){
    String str="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
      StringBuffer sb=new StringBuffer();
      for(int i=0;i<length;i++){
        int number=random.nextInt(5); // 62
        sb.append(str.charAt(number));
      }
      return sb.toString();
  }

  
  public static void main(String[] args){
    int n = 100000;
    String[] words = new String[n];
    String[] patts = new String[n];
    for(int i = 0; i < n; i++){
      String s = getRandomString(10000);
      words[i] = s;
      int pos = random.nextInt(s.length());
      int len = random.nextInt(128) + 1;
      patts[i] = s.substring(pos , Math.min(s.length(), pos + len));
    }

    // algorithms
    IStringMatch[] algs = new IStringMatch[]{
      new KMP(),
      new BM(),
      new Sunday(),
      new KMPWithBC()
    };
    // answers
    int[] ans = new int[n];
    for(int i = 0; i < n; i++){
      ans[i] = words[i].indexOf(patts[i]);
    }
    for(var al : algs){
      Date t1 = new Date();
      for(int i = 0; i < n; i++){
        int an = al.match(words[i], patts[i]);
        if(an != ans[i]) {
          System.out.println(al + ":" + words[i] + " matches " + patts[i] + " eqauls " + ans[i] + " but is" + an);
          System.exit(-1);
        }
      }
      Date t2 = new Date();
      System.out.println(al + ":" + ((t2.getTime() - t1.getTime())) + "ms");  
    }

    System.out.println("sunday: " + Sunday.count);
    System.out.println("kmpbc: " + KMPWithBC.count);
  }
}

附录

BM算法实现

package org.example;

class BMext {

      /**
     * Returns the index within this string of the first occurrence of the
     * specified substring. If it is not a substring, return -1.
     *
     * There is no Galil because it only generates one match.
     *
     * @param haystack The string to be scanned
     * @param needle The target string to search
     * @return The start index of the substring
     */
    public static int indexOf(char[] haystack, char[] needle) {
        if (needle.length == 0) {
            return 0;
        }
        int charTable[] = makeCharTable(needle);
        int offsetTable[] = makeOffsetTable(needle);
        for (int i = needle.length - 1, j; i < haystack.length;) {
            for (j = needle.length - 1; needle[j] == haystack[i]; --i, --j) {
                if (j == 0) {
                    return i;
                }
            }
            // i += needle.length - j; // For naive method
            i += Math.max(offsetTable[needle.length - 1 - j], charTable[haystack[i]]);
        }
        return -1;
    }

    /**
     * Makes the jump table based on the mismatched character information.
     */
    private static int[] makeCharTable(char[] needle) {
        //final int ALPHABET_SIZE = Character.MAX_VALUE + 1; // 65536
        final int ALPHABET_SIZE = 256;
        int[] table = new int[ALPHABET_SIZE];
        for (int i = 0; i < table.length; ++i) {
            table[i] = needle.length;
        }
        for (int i = 0; i < needle.length; ++i) {
            table[needle[i]] = needle.length - 1 - i;
        }
        return table;
    }

    /**
     * Makes the jump table based on the scan offset which mismatch occurs.
     * (bad-character rule).
     */
    private static int[] makeOffsetTable(char[] needle) {
        int[] table = new int[needle.length];
        int lastPrefixPosition = needle.length;
        for (int i = needle.length; i > 0; --i) {
            if (isPrefix(needle, i)) {
                lastPrefixPosition = i;
            }
            table[needle.length - i] = lastPrefixPosition - i + needle.length;
        }
        for (int i = 0; i < needle.length - 1; ++i) {
            int slen = suffixLength(needle, i);
            table[slen] = needle.length - 1 - i + slen;
        }
        return table;
    }

    /**
     * Is needle[p:end] a prefix of needle?
     */
    private static boolean isPrefix(char[] needle, int p) {
        for (int i = p, j = 0; i < needle.length; ++i, ++j) {
            if (needle[i] != needle[j]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the maximum length of the substring ends at p and is a suffix.
     * (good-suffix rule)
     */
    private static int suffixLength(char[] needle, int p) {
        int len = 0;
        for (int i = p, j = needle.length - 1;
                 i >= 0 && needle[i] == needle[j]; --i, --j) {
            len += 1;
        }
        return len;
    }
}