博客
关于我
Java实现哈希(相似度)算法,用于试题相似度,字符串相似度等场景
阅读量:117 次
发布时间:2019-02-27

本文共 4579 字,大约阅读时间需要 15 分钟。

??????????????????????????????????????????????????????????????????????????????????????????????????????????

???????????

??????????Java?HashMap?????????????????????????????????

  • ????????????????????
  • ???????????????????????????51???????100101?101011?
  • ?????????????????2????????????
  • ?????????????????????????
  • ??????????????????????????????simhash???
  • ??????????????????????????????????????????


    ???????????

    ???????????????????????

  • ???????HTML???????????????????
  • ???????????????????????????
  • ???????????????????????????????????????
  • ?????????????????????????????
  • ???????
    • ????????????????????
    • ???????AND?XOR?????????????
    • ?????????????????????
  • ???????

    public class SimHashAlgorithm {
    private String tokens;
    private BigInteger strSimHash;
    private int hashbits = 64;
    public SimHashAlgorithm(String tokens) {
    this.tokens = tokens;
    this.strSimHash = computeSimHash(tokens);
    }
    private BigInteger computeSimHash(String tokens) {
    tokens = cleanResume(tokens);
    int[] v = new int[hashbits];
    List
    termList = StandardTokenizer.segment(tokens);
    Map
    wordCount = new HashMap<>();
    Map
    stopNatures = new HashMap<>();
    stopNatures.put("w", "");
    wordCount.put("n", 2);
    for (Term term : termList) {
    String word = term.word;
    String nature = term.nature.toString();
    if (stopNatures.containsKey(nature)) continue;
    if (wordCount.containsKey(word)) {
    int count = wordCount.get(word);
    if (count > 5) continue;
    wordCount.put(word, count + 1);
    } else {
    wordCount.put(word, 1);
    }
    BigInteger t = hash(word);
    for (int i = 0; i < hashbits; i++) {
    BigInteger bitmask = BigInteger.ONE.shiftLeft(i);
    if (t.and(bitmask).signum() != 0) {
    v[i] += wordCount.get(word) * weightOfNature.getOrDefault(nature, 1);
    } else {
    v[i] -= wordCount.get(word) * weightOfNature.getOrDefault(nature, 1);
    }
    }
    }
    BigInteger fingerprint = BigInteger.ZERO;
    for (int i = 0; i < hashbits; i++) {
    if (v[i] > 0) {
    fingerprint = fingerprint.add(BigInteger.ONE.shiftLeft(i));
    }
    }
    return fingerprint;
    }
    private BigInteger hash(String source) {
    if (source == null || source.isEmpty()) return BigInteger.ZERO;
    StringBuilder sb = new StringBuilder();
    while (sb.length() < 64) {
    sb.append(source.charAt(0));
    source = source.substring(1);
    }
    source = sb.toString();
    char[] chars = source.toCharArray();
    BigInteger x = BigInteger.valueOf((long) chars[0] << 7);
    BigInteger m = new BigInteger("1000003");
    BigInteger mask = m.pow(hashbits).subtract(BigInteger.ONE);
    for (char c : chars) {
    BigInteger temp = BigInteger.valueOf((long) c);
    x = x.multiply(m).xor(temp).and(mask);
    }
    x = x.xor(BigInteger.valueOf(source.length()));
    return x.equals(BigInteger.valueOf(-1)) ? BigInteger.valueOf(-2) : x;
    }
    public int hammingDistance(SimHashAlgorithm other) {
    BigInteger m = BigInteger.ONE.shiftLeft(hashbits).subtract(BigInteger.ONE);
    BigInteger x = strSimHash.xor(other.strSimHash).and(m);
    int distance = 0;
    while (x.signum() != 0) {
    distance++;
    x = x.and(x.subtract(BigInteger.ONE));
    }
    return distance;
    }
    public double getSemblance(SimHashAlgorithm s2) {
    int distance = hammingDistance(s2);
    return 1 - (distance / hashbits * 100);
    }
    public static String getPercentValue(double similarity) {
    NumberFormat fmt = NumberFormat.getPercentInstance();
    fmt.setMaximumFractionDigits(2);
    return fmt.format(similarity);
    }
    public static void main(String[] args) {
    String[] str1 = {"?????", "1234567890"};
    String[] str2 = {"??????", "1234567890"};
    SimHashAlgorithm hash1 = new SimHashAlgorithm(str1[0] + str1[1]);
    SimHashAlgorithm hash2 = new SimHashAlgorithm(str2[0] + str2[1]);
    double similarity = hash1.getSemblance(hash2);
    System.out.println("?????" + getPercentValue(similarity) + "%");
    }
    }

    ?????????????

    ??????????????????????

    • ?????????????????67.19%
    • 1234567890?1234567890?????100%

    ????????????????????????????????????????

    转载地址:http://eynb.baihongyu.com/

    你可能感兴趣的文章
    Netty源码—3.Reactor线程模型三
    查看>>
    Netty源码—4.客户端接入流程一
    查看>>
    Netty源码—4.客户端接入流程二
    查看>>
    Netty源码—5.Pipeline和Handler一
    查看>>
    Netty源码—5.Pipeline和Handler二
    查看>>
    Netty源码—6.ByteBuf原理一
    查看>>
    Netty源码—6.ByteBuf原理二
    查看>>
    Netty源码—7.ByteBuf原理三
    查看>>
    Netty源码—7.ByteBuf原理四
    查看>>
    Netty源码—8.编解码原理一
    查看>>
    Netty源码—8.编解码原理二
    查看>>
    Netty源码解读
    查看>>
    Netty的Socket编程详解-搭建服务端与客户端并进行数据传输
    查看>>
    Netty相关
    查看>>
    Netty遇到TCP发送缓冲区满了 写半包操作该如何处理
    查看>>
    Netty:ChannelPipeline和ChannelHandler为什么会鬼混在一起?
    查看>>
    Netty:原理架构解析
    查看>>
    Network Dissection:Quantifying Interpretability of Deep Visual Representations(深层视觉表征的量化解释)
    查看>>
    Network Sniffer and Connection Analyzer
    查看>>
    Network 灰鸽宝典【目录】
    查看>>