|
for (int i = 0; i < buffer.length - 1; i++) {
// 缓冲区可能没填满
if (buffer[i][0] == null) {
continue;
}
k = i;
for (int j = i + 1; j < buffer.length; j++) {
// 缓冲区可能没填满
if (buffer[j][0] == null) {
continue;
}
// 获取2个缓冲区小块的起始编号值
num1 = Long.parseLong(buffer[k][0]);
num2 = Long.parseLong(buffer[j][0]);
if (num2 < num1) {
k = j;
}
}
if (k != i) {
temp = buffer[k];
buffer[k] = buffer[i];
buffer[i] = temp;
}
}
}
/**
 * Reads inverted-index data from a file.
 *
 * <p>Each line is split on single spaces and stored as one {@code String[]};
 * empty tokens are kept as they come out of {@link String#split(String)}.
 *
 * @param filePath
 *            path of the single file to read
 * @return one token array per line, in file order; if an I/O error occurs,
 *         the lines read before the error (possibly none) are returned
 */
private ArrayList<String[]> readInvertedFile(String filePath) {
    ArrayList<String[]> dataArray = new ArrayList<>();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader in = new BufferedReader(new FileReader(new File(filePath)))) {
        String str;
        while ((str = in.readLine()) != null) {
            dataArray.add(str.split(" "));
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was read so far.
        e.printStackTrace();
    }
    return dataArray;
}
/**
 * Reads the words contained in a data file.
 *
 * <p>Every line is split on single spaces. Empty tokens, which
 * {@link String#split(String)} produces for consecutive spaces and for blank
 * lines, are dropped.
 *
 * @param filePath
 *            path of the single file to read
 * @return all non-empty words in reading order; if an I/O error occurs, the
 *         words of the lines read before the error (possibly none)
 */
private ArrayList<String> readDataFile(String filePath) {
    ArrayList<String> words = new ArrayList<>();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader in = new BufferedReader(new FileReader(new File(filePath)))) {
        String str;
        while ((str = in.readLine()) != null) {
            // Split each line and append its words to the overall list.
            for (String word : str.split(" ")) {
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was read so far.
        e.printStackTrace();
    }
    return words;
}
}
SPIMI算法工具类 SPIMITool.java:
package InvertedIndex;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
/**
* SPIMI内存式单边扫描构建算法
* @author lyq
*
*/
public class SPIMITool {
// Output file path of the inverted index.
// NOTE(review): the constructor visible in this class does not assign this
// field — presumably it is set elsewhere before use; confirm.
private String outputFilePath;
// Paths of the effective-word files of the documents that were read in.
private ArrayList effectiveWordFiles;
// In-memory buffer; more space can be added when it is not large enough.
private ArrayList buffers;
/**
 * Creates a tool instance for the given effective-word files.
 *
 * @param effectiveWordFiles
 *            list stored by reference (no copy) in the field of the same
 *            name
 */
public SPIMITool(ArrayList effectiveWordFiles) {
    this.effectiveWordFiles = effectiveWordFiles;
}
/**
 * Reads the words contained in a data file.
 *
 * <p>Every line is split on single spaces. Empty tokens, which
 * {@link String#split(String)} produces for consecutive spaces and for blank
 * lines, are dropped so that the empty string never becomes a word.
 *
 * @param filePath
 *            path of the single file to read
 * @return all non-empty words in reading order; if an I/O error occurs, the
 *         words of the lines read before the error (possibly none)
 */
private ArrayList<String> readDataFile(String filePath) {
    ArrayList<String> words = new ArrayList<>();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader in = new BufferedReader(new FileReader(new File(filePath)))) {
        String str;
        while ((str = in.readLine()) != null) {
            // Split each line and append its words to the overall list.
            for (String word : str.split(" ")) {
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was read so far.
        e.printStackTrace();
    }
    return words;
}
/**
 * Builds the inverted index from the given documents and writes it out.
 *
 * <p>Resets {@code buffers}, then hands one {@code [word, docId]} record per
 * effective word of every document to {@code addRecordToBuffer}, and finally
 * passes the buffers together with {@code outputFilePath} to
 * {@code writeOutOperation}.
 *
 * @param docs
 *            the document collection
 */
private void writeInvertedIndex(ArrayList<Document> docs) {
    buffers = new ArrayList<>();
    for (Document tempDoc : docs) {
        // Typed local so that the for-each below yields String elements.
        ArrayList<String> datas = tempDoc.effectWords;
        for (String word : datas) {
            // Record layout: [0] = word, [1] = id of the containing document.
            String[] recordData = new String[] { word, tempDoc.docId + "" };
            addRecordToBuffer(recordData);
        }
    }
    // Finally write the accumulated data out to disk.
    writeOutOperation(buffers, outputFilePath);
}
/**
* 将新读入的数据记录读入到内存缓冲中,如果存在则加入到倒排记录表中
* @param insertedData
* 待插入的数据
*/
private void addRecordToBuffer(String[] insertedData){
boolean isContained = false;
String wor |