博客
关于我
Java实现哈希(相似度)算法,用于试题相似度,字符串相似度等场景
阅读量:117 次
发布时间:2019-02-27

本文共 4488 字,大约阅读时间需要 14 分钟。

??????????????????????????????????????????????????????????????????????????????????????????????????????????

???????????

??????????Java?HashMap?????????????????????????????????

  • ????????????????????
  • ???????????????????????????51???????100101?101011?
  • ?????????????????2????????????
  • ?????????????????????????
  • ??????????????????????????????simhash???
  • ??????????????????????????????????????????


    ???????????

    ???????????????????????

  • ???????HTML???????????????????
  • ???????????????????????????
  • ???????????????????????????????????????
  • ?????????????????????????????
  • ???????
    • ????????????????????
    • ???????AND?XOR?????????????
    • ?????????????????????
  • ???????

    /**
     * Computes a 64-bit SimHash fingerprint for a piece of text and compares
     * fingerprints by Hamming distance.
     *
     * <p>The text is segmented into terms with {@code StandardTokenizer}; every
     * retained term is hashed, and each bit of that hash votes (weighted by the
     * term's running occurrence count and its part-of-speech weight) for or
     * against the corresponding bit of the final fingerprint.
     */
    public class SimHashAlgorithm {

        /** Number of bits in the fingerprint. */
        private final int hashbits = 64;

        /** Upper bound on how many leading characters of a term are hashed. */
        private static final int MAX_TOKEN_CHARS = 64;

        /** The raw text this fingerprint was built from. */
        private final String tokens;

        /** Fingerprint of {@link #tokens}; always in the range [0, 2^hashbits). */
        private final BigInteger strSimHash;

        /**
         * Builds the fingerprint of the given text.
         *
         * @param tokens the text to fingerprint
         */
        public SimHashAlgorithm(String tokens) {
            this.tokens = tokens;
            this.strSimHash = computeSimHash(tokens);
        }

        /**
         * Normalises the input before segmentation: removes anything that looks
         * like an HTML tag, removes {@code &nbsp;} entities and all whitespace,
         * and lower-cases the rest.
         *
         * <p>NOTE(review): the original class called {@code cleanResume} without
         * defining it; this is a standard-library stand-in. Confirm it matches
         * the intended cleaning rules.
         *
         * @param content raw text, may be {@code null}
         * @return the cleaned text, never {@code null}
         */
        private String cleanResume(String content) {
            if (content == null) {
                return "";
            }
            return content
                    .replaceAll("<[^>]*>", "")
                    .replace("&nbsp;", "")
                    .replaceAll("\\s+", "")
                    .toLowerCase(java.util.Locale.ROOT);
        }

        /**
         * Segments the text and folds the per-term hashes into one fingerprint.
         *
         * @param tokens the text to fingerprint
         * @return the fingerprint as a non-negative value below 2^hashbits
         */
        private BigInteger computeSimHash(String tokens) {
            tokens = cleanResume(tokens);
            int[] v = new int[hashbits];

            List<Term> termList = StandardTokenizer.segment(tokens);

            // Running occurrence count per word.
            Map<String, Integer> wordCount = new HashMap<>();

            // Vote multiplier per part-of-speech tag; tags not listed weigh 1.
            // NOTE(review): the original put ("n", 2) into wordCount while reading
            // an undeclared weightOfNature map; this assumes the entry was meant
            // to be a weight for the "n" tag -- confirm.
            Map<String, Integer> weightOfNature = new HashMap<>();
            weightOfNature.put("n", 2);

            // Part-of-speech tags whose terms are skipped entirely.
            Map<String, String> stopNatures = new HashMap<>();
            stopNatures.put("w", "");

            for (Term term : termList) {
                String word = term.word;
                String nature = term.nature.toString();
                if (stopNatures.containsKey(nature)) {
                    continue;
                }

                if (wordCount.containsKey(word)) {
                    int count = wordCount.get(word);
                    // Stop counting (and voting) once a word has been seen often
                    // enough, so a single frequent word cannot dominate.
                    if (count > 5) {
                        continue;
                    }
                    wordCount.put(word, count + 1);
                } else {
                    wordCount.put(word, 1);
                }

                BigInteger t = hash(word);
                // Loop-invariant for this term, so computed once.
                int vote = wordCount.get(word) * weightOfNature.getOrDefault(nature, 1);
                for (int i = 0; i < hashbits; i++) {
                    if (t.testBit(i)) {
                        v[i] += vote;
                    } else {
                        v[i] -= vote;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.ZERO;
            for (int i = 0; i < hashbits; i++) {
                if (v[i] > 0) {
                    fingerprint = fingerprint.setBit(i);
                }
            }
            return fingerprint;
        }

        /**
         * Hashes one term to a non-negative value below 2^hashbits.
         *
         * <p>Only the first {@value #MAX_TOKEN_CHARS} characters take part. The
         * original loop that enforced this limit threw
         * {@code StringIndexOutOfBoundsException} for every shorter term.
         *
         * @param source the term to hash
         * @return the hash, or zero for a {@code null} or empty term
         */
        private BigInteger hash(String source) {
            if (source == null || source.isEmpty()) {
                return BigInteger.ZERO;
            }
            if (source.length() > MAX_TOKEN_CHARS) {
                source = source.substring(0, MAX_TOKEN_CHARS);
            }

            BigInteger x = BigInteger.valueOf((long) source.charAt(0) << 7);
            BigInteger m = BigInteger.valueOf(1000003L);
            // Keep the low `hashbits` bits -- the same mask hammingDistance uses.
            // (The original used 1000003^hashbits - 1, which is not a bit mask.)
            BigInteger mask = BigInteger.ONE.shiftLeft(hashbits).subtract(BigInteger.ONE);

            for (char c : source.toCharArray()) {
                x = x.multiply(m).xor(BigInteger.valueOf((long) c)).and(mask);
            }
            // x is non-negative here, so the original's "-1 becomes -2" branch
            // could never trigger and is omitted.
            return x.xor(BigInteger.valueOf(source.length()));
        }

        /**
         * Counts the bit positions in which two fingerprints differ.
         *
         * @param other the fingerprint to compare against
         * @return the Hamming distance, between 0 and hashbits inclusive
         */
        public int hammingDistance(SimHashAlgorithm other) {
            BigInteger m = BigInteger.ONE.shiftLeft(hashbits).subtract(BigInteger.ONE);
            // The masked XOR is non-negative, so bitCount() is its number of 1 bits.
            return strSimHash.xor(other.strSimHash).and(m).bitCount();
        }

        /**
         * Similarity of two fingerprints as a fraction.
         *
         * @param s2 the fingerprint to compare against
         * @return 1.0 for identical fingerprints down to 0.0 when every bit differs
         */
        public double getSemblance(SimHashAlgorithm s2) {
            int distance = hammingDistance(s2);
            // Floating-point division: the original integer expression
            // `distance / hashbits * 100` could only produce 1 or -99.
            return 1.0 - (double) distance / hashbits;
        }

        /**
         * Formats a fraction as a percentage with at most two fraction digits,
         * using the default locale's percent format.
         *
         * @param similarity a fraction such as 0.6719
         * @return the formatted percentage, percent sign included
         */
        public static String getPercentValue(double similarity) {
            NumberFormat fmt = NumberFormat.getPercentInstance();
            fmt.setMaximumFractionDigits(2);
            return fmt.format(similarity);
        }

        /** Small demo: fingerprints two strings and prints their similarity. */
        public static void main(String[] args) {
            String[] str1 = {"?????", "1234567890"};
            String[] str2 = {"??????", "1234567890"};
            SimHashAlgorithm hash1 = new SimHashAlgorithm(str1[0] + str1[1]);
            SimHashAlgorithm hash2 = new SimHashAlgorithm(str2[0] + str2[1]);
            double similarity = hash1.getSemblance(hash2);
            // getPercentValue already appends the percent sign.
            System.out.println("?????" + getPercentValue(similarity));
        }
    }

    ?????????????

    ??????????????????????

    • ?????????????????67.19%
    • 1234567890?1234567890?????100%

    ????????????????????????????????????????

    转载地址:http://eynb.baihongyu.com/

    你可能感兴趣的文章
    Opencv介绍及opencv3.0在 vs2010上的配置
    查看>>
    OpenCV使用霍夫变换检测图像中的形状
    查看>>
    opencv保存图片路径包含中文乱码解决方案
    查看>>
    OpenCV保证输入图像为三通道
    查看>>
    OpenCV入门教程(非常详细)从零基础入门到精通,看完这一篇就够了
    查看>>
    opencv图像分割2-GMM
    查看>>
    opencv图像分割3-分水岭方法
    查看>>
    opencv图像切割1-KMeans方法
    查看>>
    OpenCV图像处理篇之阈值操作函数
    查看>>
    opencv图像特征融合-seamlessClone
    查看>>
    OpenCV图像的深浅拷贝
    查看>>
    OpenCV在Google Colboratory中不起作用
    查看>>
    OpenCV学习(13) 细化算法(1)(转)
    查看>>
    OpenCV学习笔记(27)KAZE 算法原理与源码分析(一)非线性扩散滤波
    查看>>
    OpenCV学堂 | CV开发者必须懂的9种距离度量方法,内含欧氏距离、切比雪夫距离等(建议收藏)
    查看>>
    OpenCV学堂 | OpenCV案例 | 基于轮廓分析对象提取
    查看>>
    OpenCV学堂 | YOLOv8与YOLO11自定义数据集迁移学习效果对比
    查看>>
    OpenCV学堂 | YOLOv8官方团队宣布YOLOv11 发布了
    查看>>
    OpenCV学堂 | YOLOv8实战 | 荧光显微镜细胞图像检测
    查看>>
    OpenCV学堂 | 汇总 | 深度学习图像去模糊技术与模型
    查看>>