倒排索引构建算法BSBI和SPIMI - 数据库编程

dFiles, int readBufferSize, int writeBufferSize) { this.effectiveWordFiles = effectiveWordFiles; this.readBufferSize = readBufferSize; this.writeBufferSize = writeBufferSize; initBuffers(); } /** * 初始化缓冲区的设置 */ private void initBuffers() { readBuffer1 = new String[readBufferSize][2]; readBuffer2 = new String[readBufferSize][2]; writeBuffer = new String[writeBufferSize][2]; } /** * 从文件中读取有效词并进行编码替换 * * @param filePath * 返回文档 */ private Document readEffectWords(String filePath) { long hashcode = 0; String w; Document document; code2word = new HashMap(); ArrayList words; words = readDataFile(filePath); for (int i = 0; i < words.size(); i++) { w = words.get(i); hashcode = BKDRHash(w); hashcode = hashcode % 10000; // 将有效词的hashcode取模值作为对应的代表 code2word.put(hashcode + "", w); w = hashcode + ""; words.set(i, w); } document = new Document(words, filePath, DOC_ID); DOC_ID++; return document; } /** * 将字符做哈希值的转换 * * @param str * 待转换字符 * @return */ private long BKDRHash(String str) { int seed = 31; /* 31 131 1313 13131 131313 etc.. */ long hash = 0; int i = 0; for (i = 0; i < str.length(); i++) { hash = (hash * seed) + (str.charAt(i)); } return hash; } /** * 根据输入的有效词输出倒排索引文件 */ public void outputInvertedFiles() { int index = 0; String baseFilePath = ""; outputFilePath = ""; Document doc; ArrayList tempPaths; ArrayList invertedData1; ArrayList invertedData2; tempPaths = new ArrayList<>(); for (String filePath : effectiveWordFiles) { doc = readEffectWords(filePath); writeOutFile(doc); index = doc.filePath.lastIndexOf("."); baseFilePath = doc.filePath.substring(0, index); writeOutOperation(writeBuffer, baseFilePath + "-temp.txt"); tempPaths.add(baseFilePath + "-temp.txt"); } outputFilePath = baseFilePath + "-bsbi-inverted.txt"; // 将中间产生的倒排索引数据进行总的合并并输出到一个文件中 for (int i = 1; i < tempPaths.size(); i++) { if (i == 1) { invertedData1 = readInvertedFile(tempPaths.get(0)); } else { invertedData1 = readInvertedFile(outputFilePath); } invertedData2 = readInvertedFile(tempPaths.get(i)); mergeInvertedData(invertedData1, invertedData2, false, outputFilePath); writeOutOperation(writeBuffer, outputFilePath, false); } } /** * 将文档的最终的倒排索引结果写出到文件 * * @param doc * 待处理文档 */ private void writeOutFile(Document doc) { // 在读缓冲区中是否需要再排序 boolean ifSort = true; int index = 0; String baseFilePath; String[] temp; ArrayList tempWords = (ArrayList) doc.effectWords .clone(); ArrayList invertedData1; ArrayList invertedData2; invertedData1 = new ArrayList<>(); invertedData2 = new ArrayList<>(); // 将文档的数据平均拆分成2份，用于读入后面的2个缓冲区中 for (int i = 0; i < tempWords.size() / 2; i++) { temp = new String[2]; temp[0] = tempWords.get(i); temp[1] = doc.docId + ""; invertedData1.add(temp); temp = new String[2]; temp[0] = tempWords.get(i + tempWords.size() / 2); temp[1] = doc.docId + ""; invertedData2.add(temp); } // 如果是奇数个，则将最后一个补入 if (tempWords.size() % 2 == 1) { temp = new String[2]; temp[0] = tempWords.get(tempWords.size() - 1); temp[1] = doc.docId + ""; invertedData2.add(temp); } index = doc.filePath.lastIndexOf("."); baseFilePath = doc.filePath.substring(0, index); mergeInvertedData(invertedData1, invertedData2, ifSort, baseFilePath + "-temp.txt"); } /** * 合并读缓冲区数据写到写缓冲区中，用到了归并排序算法 * * @param outputPath * 写缓冲区的写出的路径 */ private void mergeWordBuffers(String outputPath) { int i = 0; int j = 0; int num1 = 0; int num2 = 0; // 写缓冲区下标 int writeIndex = 0; while (readBuffer1[i][0] != null && readBuffer2[j][0] != null) { num1 = Integer.parseInt(readBuffer1[i][0]); num2 = Integer.parseInt(readBuffer2[j][0]); // 如果缓冲1小，则优先存缓冲1到写缓冲区中 if (num1 < num2) { writeBuffer[writeIndex][0] = num1 + ""; writeBuffer[writeIndex][1] = readBuffer1[i][1]; i++; } else if (num2 < num1) { writeBuffer[writeIndex][0] = num2 + ""; writeBuffer[writeIndex][1] = readBuffer1[j][1]; j++; } else if (num1 == num2) { // 如果两个缓冲区中的数字一样，说明是同个有效词，先进行合并再写入 writeBuffer[writeIndex][0] = num1 + ""; writeBuffer[writeIndex][1] = readBuffer1[i][1] + ":" + readBuffer2[j][1]; i++; j++; } // 写的指针往后挪一位 writeIndex++; // 如果写满写缓冲区时，进行写出到文件操作 if (writeIndex >

= writeBufferSize) { writeOutOperation(writeBuffer, outputPath); writeIndex = 0; } } if (readBuffer1[i][0] == null) { writeRemainReadBuffer(readBuffer2, j, outputPath); } if (readBuffer2[j][0] == null) { writeRemainReadBuffer(readBuffer1, j, outputPath); } } /** * 将数据写出到磁盘文件操作，如果文件已经存在，则在文件尾部进行内容追加 * * @param buffer * 当前写缓冲中的数据 * @param filePath * 输出地址 */ private void writeOutOperation(String[][] buffer, String filePath) { String word; StringBuilder strBuilder = new StringBuilder(); // 将缓冲中的数据组成字符写入到文件中 for (String[] array : buffer) { if (array[0] == null) { continue; } word = array[0]; strBuilder.append(word); strBuilder.append(" "); strBuilder.append(array[1]); strBuilder.append("\n"); } try { File file = new File(filePath); PrintStream ps = new PrintStream(new FileOutputStream(file)); ps.print(strBuilder.toString());// 往文件里写入字符串 } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 将数据写出到磁盘文件操作，如果文件已经存在，则在文件尾部进行内容追加 * * @param buffer * 当前写缓冲中的数据 * @param filePath * 输出地址 * @param isCoded * 是否以编码的方式输出 */ private void writeOutOperation(String[][] buffer, String filePath, boolean isCoded) { String word; StringBuilder strBuilder = new StringBuilder(); // 将缓冲中的数据组成字符写入到文件中 for (String[] array : buffer) { if (array[0] == null) { continue; } if(!isCoded){ word = code2word.get(array[0]); }else{ word = array[0]; } strBuilder.append(word); strBuilder.append(" "); strBuilder.append(array[1]); strBuilder.ap

倒排索引构建算法BSBI和SPIMI(三)