倒排索引构建算法BSBI和SPIMI - 数据库编程

for (int i = 0; i < buffer.length - 1; i++) { // 缓冲区可能没填满 if (buffer[i][0] == null) { continue; } k = i; for (int j = i + 1; j < buffer.length; j++) { // 缓冲区可能没填满 if (buffer[j][0] == null) { continue; } // 获取2个缓冲区小块的起始编号值 num1 = Long.parseLong(buffer[k][0]); num2 = Long.parseLong(buffer[j][0]); if (num2 < num1) { k = j; } } if (k != i) { temp = buffer[k]; buffer[k] = buffer[i]; buffer[i] = temp; } } } /** * 从文件中读取倒排索引数据 * * @param filePath * 单个文件 */ private ArrayList readInvertedFile(String filePath) { File file = new File(filePath); ArrayList dataArray = new ArrayList(); try { BufferedReader in = new BufferedReader(new FileReader(file)); String str; String[] tempArray; while ((str = in.readLine()) != null) { tempArray = str.split(" "); dataArray.add(tempArray); } in.close(); } catch (IOException e) { e.getStackTrace(); } return dataArray; } /** * 从文件中读取数据 * * @param filePath * 单个文件 */ private ArrayList readDataFile(String filePath) { File file = new File(filePath); ArrayList dataArray = new ArrayList(); ArrayList words = new ArrayList<>(); try { BufferedReader in = new BufferedReader(new FileReader(file)); String str; String[] tempArray; while ((str = in.readLine()) != null) { tempArray = str.split(" "); dataArray.add(tempArray); } in.close(); } catch (IOException e) { e.getStackTrace(); } // 将每行词做拆分加入到总列表容器中 for (String[] array : dataArray) { for (String word : array) { if (!word.equals("")) { words.add(word); } } } return words; } } SPIMI算法工具类SPIMITool.java:

package InvertedIndex;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;

/**
 * SPIMI内存式单边扫描构建算法
 * @author lyq
 *
 */
public class SPIMITool {
	// Path of the file the inverted index is written to
	private String outputFilePath;
	// Paths of the input documents' effective-word files
	private ArrayList<String> effectiveWordFiles;
	// In-memory buffer of index records; it can grow when more space is needed.
	// NOTE(review): element type String[] is inferred from
	// addRecordToBuffer(String[]) - confirm against the rest of the class.
	private ArrayList<String[]> buffers;
	
	/**
	 * Creates a tool over the given effective-word files.
	 *
	 * NOTE(review): outputFilePath is not assigned here; confirm it is set
	 * elsewhere before the index is written out.
	 *
	 * @param effectiveWordFiles
	 *            list stored by reference (no copy is made)
	 */
	public SPIMITool(ArrayList effectiveWordFiles){
		this.effectiveWordFiles = effectiveWordFiles;
	}
	
	/**
	 * Reads a text file and returns every space-separated word it contains,
	 * in reading order.
	 *
	 * Empty tokens (produced by leading or consecutive spaces) are dropped.
	 * If an I/O error occurs, the stack trace is printed and the words read
	 * so far are returned.
	 *
	 * @param filePath
	 *            path of a single file
	 * @return the words of the file; never null, possibly empty
	 */
	private ArrayList<String> readDataFile(String filePath) {
		ArrayList<String> words = new ArrayList<>();

		// try-with-resources closes the reader even when readLine() throws
		try (BufferedReader in = new BufferedReader(new FileReader(filePath))) {
			String line;
			while ((line = in.readLine()) != null) {
				for (String word : line.split(" ")) {
					// split(" ") yields "" for leading/repeated spaces; skip those
					if (!word.isEmpty()) {
						words.add(word);
					}
				}
			}
		} catch (IOException e) {
			// Best effort: report the failure instead of discarding it silently
			e.printStackTrace();
		}

		return words;
	}
 
	
	/**
	 * Builds the inverted index for the given documents.
	 *
	 * Resets the in-memory buffer, feeds one {word, docId} record per
	 * effective word of every document to addRecordToBuffer, and finally
	 * hands the buffer and outputFilePath to writeOutOperation.
	 *
	 * @param docs
	 *            the document collection
	 */
	private void writeInvertedIndex(ArrayList<Document> docs){
		buffers = new ArrayList<>();

		for (Document tempDoc : docs) {
			ArrayList<String> datas = tempDoc.effectWords;

			for (String word : datas) {
				// record layout: [0] = word, [1] = id of the containing document
				String[] recordData = new String[2];
				recordData[0] = word;
				recordData[1] = tempDoc.docId + "";

				addRecordToBuffer(recordData);
			}
		}

		// finally write the buffered data out
		writeOutOperation(buffers, outputFilePath);
	}
	
	/**
	 * 将新读入的数据记录读入到内存缓冲中，如果存在则加入到倒排记录表中
	 * @param insertedData
	 * 待插入的数据
	 */
	private void addRecordToBuffer(String[] insertedData){
		boolean isContained = false;
		String wor

倒排索引构建算法BSBI和SPIMI(六)