倒排索引构建算法BSBI和SPIMI(六)

2015-07-24 12:16:29 · 作者: · 浏览: 782
for (int i = 0; i < buffer.length - 1; i++) { // 缓冲区可能没填满 if (buffer[i][0] == null) { continue; } k = i; for (int j = i + 1; j < buffer.length; j++) { // 缓冲区可能没填满 if (buffer[j][0] == null) { continue; } // 获取2个缓冲区小块的起始编号值 num1 = Long.parseLong(buffer[k][0]); num2 = Long.parseLong(buffer[j][0]); if (num2 < num1) { k = j; } } if (k != i) { temp = buffer[k]; buffer[k] = buffer[i]; buffer[i] = temp; } } } /** * 从文件中读取倒排索引数据 * * @param filePath * 单个文件 */ private ArrayList readInvertedFile(String filePath) { File file = new File(filePath); ArrayList dataArray = new ArrayList(); try { BufferedReader in = new BufferedReader(new FileReader(file)); String str; String[] tempArray; while ((str = in.readLine()) != null) { tempArray = str.split(" "); dataArray.add(tempArray); } in.close(); } catch (IOException e) { e.getStackTrace(); } return dataArray; } /** * 从文件中读取数据 * * @param filePath * 单个文件 */ private ArrayList readDataFile(String filePath) { File file = new File(filePath); ArrayList dataArray = new ArrayList(); ArrayList words = new ArrayList<>(); try { BufferedReader in = new BufferedReader(new FileReader(file)); String str; String[] tempArray; while ((str = in.readLine()) != null) { tempArray = str.split(" "); dataArray.add(tempArray); } in.close(); } catch (IOException e) { e.getStackTrace(); } // 将每行词做拆分加入到总列表容器中 for (String[] array : dataArray) { for (String word : array) { if (!word.equals("")) { words.add(word); } } } return words; } } SPIMI算法工具类SPIMITool.java:

?

?

package InvertedIndex;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;

/**
 * SPIMI内存式单边扫描构建算法
 * @author lyq
 *
 */
public class SPIMITool {
	//倒排索引输出文件地址
	private String outputFilePath;
	// 读入的文档的有效词文件地址
	private ArrayList
effectiveWordFiles; // 内存缓冲区,不够还能够在增加空间 private ArrayList buffers; public SPIMITool(ArrayList effectiveWordFiles){ this.effectiveWordFiles = effectiveWordFiles; } /** * 从文件中读取数据 * * @param filePath * 单个文件 */ private ArrayList readDataFile(String filePath) { File file = new File(filePath); ArrayList dataArray = new ArrayList(); ArrayList words = new ArrayList<>(); try { BufferedReader in = new BufferedReader(new FileReader(file)); String str; String[] tempArray; while ((str = in.readLine()) != null) { tempArray = str.split(" "); dataArray.add(tempArray); } in.close(); } catch (IOException e) { e.getStackTrace(); } // 将每行词做拆分加入到总列表容器中 for (String[] array : dataArray) { for (String word : array) { words.add(word); } } return words; } /** * 根据已有的文档数据进行倒排索引文件的构建 * @param docs * 文档集合 */ private void writeInvertedIndex(ArrayList docs){ ArrayList datas; String[] recordData; buffers = new ArrayList<>(); for(Document tempDoc: docs){ datas = tempDoc.effectWords; for(String word: datas){ recordData = new String[2]; recordData[0] = word; recordData[1] = tempDoc.docId + ""; addRecordToBuffer(recordData); } } //最后将数据写出到磁盘中 writeOutOperation(buffers, outputFilePath); } /** * 将新读入的数据记录读入到内存缓冲中,如果存在则加入到倒排记录表中 * @param insertedData * 待插入的数据 */ private void addRecordToBuffer(String[] insertedData){ boolean isContained = false; String wor