|
dName;
wordName = insertedData[0];
for(String[] array: buffers){
if(array[0].equals(wordName)){
isContained = true;
//添加倒排索引记录,以:隔开
array[1] += ":" + insertedData[1];
break;
}
}
//如果没有包含,则说明是新的数据,直接添加
if(!isContained){
buffers.add(insertedData);
}
}
/**
 * Writes the buffered entries to a file on disk.
 *
 * <p>Every entry becomes one line made of {@code array[0]}, a single space and
 * {@code array[1]}. The file is opened with {@code new FileOutputStream(file)}, so a
 * file that already exists is truncated and overwritten; nothing is appended to it.
 *
 * @param buffer
 *            entries currently held in the write buffer; index 0 and index 1 of
 *            every element are written
 * @param filePath
 *            path of the output file
 */
private void writeOutOperation(ArrayList<String[]> buffer, String filePath) {
    // Assemble the whole file content in memory first.
    StringBuilder strBuilder = new StringBuilder();
    for (String[] array : buffer) {
        strBuilder.append(array[0]).append(" ").append(array[1]).append("\n");
    }

    // try-with-resources guarantees the stream is flushed and the file handle is
    // released, even if writing fails part-way.
    try (PrintStream ps = new PrintStream(new FileOutputStream(new File(filePath)))) {
        // println emits one extra line terminator after the last entry.
        ps.println(strBuilder.toString());
    } catch (FileNotFoundException e) {
        // Best effort: the file could not be opened or created, so report it and
        // carry on without producing any output.
        e.printStackTrace();
    }
}
/**
 * Builds the inverted index file from all effective-word files.
 *
 * <p>For each path in {@code effectiveWordFiles} the words are read with
 * {@code readDataFile} and wrapped in a {@code Document} whose id counts up from 1.
 * {@code outputFilePath} is set to the directory of the first file followed by
 * {@code spimi-<name1>-<name2>...txt}, where each name is the file name without its
 * directory and extension. The documents are then passed to
 * {@code writeInvertedIndex}.
 *
 * <p>Both {@code \} and {@code /} are accepted as directory separators, and a file
 * name without an extension is used as-is. If {@code effectiveWordFiles} is
 * {@code null} or empty the method returns without doing anything.
 */
public void createInvertedIndexFile() {
    if (effectiveWordFiles == null || effectiveWordFiles.isEmpty()) {
        // Nothing to index; avoids the IndexOutOfBoundsException from get(0).
        return;
    }

    // The output file is placed next to the first input file.
    String firstPath = effectiveWordFiles.get(0);
    String baseFilePath = firstPath.substring(0, lastPathSeparatorIndex(firstPath) + 1);
    outputFilePath = baseFilePath + "spimi";

    ArrayList<Document> docs = new ArrayList<>();
    int docId = 1;
    for (String path : effectiveWordFiles) {
        // Load the effective words of this document.
        docs.add(new Document(readDataFile(path), path, docId));
        docId++;

        // Every input file contributes its base name to the output file name.
        outputFilePath += "-" + stripDirAndExtension(path);
    }
    outputFilePath += ".txt";

    // Create the inverted index file from the collected documents.
    writeInvertedIndex(docs);
}

/** Returns the index of the last {@code \} or {@code /} in {@code path}, or -1 if there is none. */
private static int lastPathSeparatorIndex(String path) {
    return Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
}

/** Returns the file name of {@code path} without its directory part and without its extension. */
private static String stripDirAndExtension(String path) {
    int start = lastPathSeparatorIndex(path) + 1;
    int dot = path.lastIndexOf('.');
    // A dot located inside the directory part is not an extension separator.
    int end = dot >= start ? dot : path.length();
    return path.substring(start, end);
}
}
算法测试类 Client.java:
package InvertedIndex;
import java.util.ArrayList;
/**
 * Test driver for the inverted-index tools.
 *
 * <p>Pre-processes {@code doc1.txt} and {@code doc2.txt} from a base directory, then
 * runs both {@code BSBITool} and {@code SPIMITool} on the resulting effective-word
 * files. The base directory may be given as the first command-line argument; when it
 * is omitted the original hard-coded directory is used.
 *
 * @author lyq
 */
public class Client {
    /** Base directory used when no command-line argument is supplied. */
    private static final String DEFAULT_BASE_FILE_PATH = "C:\\Users\\lyq\\Desktop\\icon\\";

    /**
     * Runs the pre-processing step followed by both index construction tools.
     *
     * @param args
     *            optional; {@code args[0]} is the directory containing
     *            {@code doc1.txt} and {@code doc2.txt}
     */
    public static void main(String[] args) {
        // Sizes of the read and write buffers.
        int readBufferSize = 10;
        int writeBufferSize = 20;
        String baseFilePath = resolveBaseFilePath(args);

        ArrayList<String> docFilePaths = new ArrayList<>();
        docFilePaths.add(baseFilePath + "doc1.txt");
        docFilePaths.add(baseFilePath + "doc2.txt");

        // Document pre-processing tool.
        PreTreatTool preTool = new PreTreatTool(docFilePaths);
        preTool.preTreatWords();

        // After pre-processing, fetch the paths of the effective-word files.
        ArrayList<String> efwFilePaths = preTool.getEFWPaths();

        // BSBI: disk-based external sorting algorithm.
        BSBITool bTool = new BSBITool(efwFilePaths, readBufferSize, writeBufferSize);
        bTool.outputInvertedFiles();

        // SPIMI: single-pass in-memory index construction algorithm.
        SPIMITool sTool = new SPIMITool(efwFilePaths);
        sTool.createInvertedIndexFile();
    }

    /**
     * Picks the base directory: {@code args[0]} when present and non-empty, otherwise
     * the default. A trailing separator is appended if missing so that file names can
     * be concatenated directly.
     */
    private static String resolveBaseFilePath(String[] args) {
        if (args == null || args.length == 0 || args[0] == null || args[0].isEmpty()) {
            return DEFAULT_BASE_FILE_PATH;
        }
        String dir = args[0];
        if (dir.endsWith("\\") || dir.endsWith("/")) {
            return dir;
        }
        return dir + java.io.File.separator;
    }
}
算法的输出:
为了更贴近真实的使用场景,算法的输出都以文件的形式保存。
首先是预处理类处理之后得到的有效词文件doc1-efword.txt和doc2-efword.txt:
mike
study
yesterday
got
last
exam
thinks
english
he
可以看到,一些修饰词之类的无效词已经被过滤掉了。
下面是B |