|
dFiles, int readBufferSize,
int writeBufferSize) {
this.effectiveWordFiles = effectiveWordFiles;
this.readBufferSize = readBufferSize;
this.writeBufferSize = writeBufferSize;
initBuffers();
}
/**
* 初始化缓冲区的设置
*/
private void initBuffers() {
readBuffer1 = new String[readBufferSize][2];
readBuffer2 = new String[readBufferSize][2];
writeBuffer = new String[writeBufferSize][2];
}
/**
* 从文件中读取有效词并进行编码替换
*
* @param filePath
* 返回文档
*/
private Document readEffectWords(String filePath) {
long hashcode = 0;
String w;
Document document;
code2word = new HashMap();
ArrayList words;
words = readDataFile(filePath);
for (int i = 0; i < words.size(); i++) {
w = words.get(i);
hashcode = BKDRHash(w);
hashcode = hashcode % 10000;
// 将有效词的hashcode取模值作为对应的代表
code2word.put(hashcode + "", w);
w = hashcode + "";
words.set(i, w);
}
document = new Document(words, filePath, DOC_ID);
DOC_ID++;
return document;
}
/**
* 将字符做哈希值的转换
*
* @param str
* 待转换字符
* @return
*/
private long BKDRHash(String str) {
int seed = 31; /* 31 131 1313 13131 131313 etc.. */
long hash = 0;
int i = 0;
for (i = 0; i < str.length(); i++) {
hash = (hash * seed) + (str.charAt(i));
}
return hash;
}
/**
* 根据输入的有效词输出倒排索引文件
*/
public void outputInvertedFiles() {
int index = 0;
String baseFilePath = "";
outputFilePath = "";
Document doc;
ArrayList tempPaths;
ArrayList invertedData1;
ArrayList invertedData2;
tempPaths = new ArrayList<>();
for (String filePath : effectiveWordFiles) {
doc = readEffectWords(filePath);
writeOutFile(doc);
index = doc.filePath.lastIndexOf(".");
baseFilePath = doc.filePath.substring(0, index);
writeOutOperation(writeBuffer, baseFilePath + "-temp.txt");
tempPaths.add(baseFilePath + "-temp.txt");
}
outputFilePath = baseFilePath + "-bsbi-inverted.txt";
// 将中间产生的倒排索引数据进行总的合并并输出到一个文件中
for (int i = 1; i < tempPaths.size(); i++) {
if (i == 1) {
invertedData1 = readInvertedFile(tempPaths.get(0));
} else {
invertedData1 = readInvertedFile(outputFilePath);
}
invertedData2 = readInvertedFile(tempPaths.get(i));
mergeInvertedData(invertedData1, invertedData2, false,
outputFilePath);
writeOutOperation(writeBuffer, outputFilePath, false);
}
}
/**
* 将文档的最终的倒排索引结果写出到文件
*
* @param doc
* 待处理文档
*/
private void writeOutFile(Document doc) {
// 在读缓冲区中是否需要再排序
boolean ifSort = true;
int index = 0;
String baseFilePath;
String[] temp;
ArrayList tempWords = (ArrayList) doc.effectWords
.clone();
ArrayList invertedData1;
ArrayList invertedData2;
invertedData1 = new ArrayList<>();
invertedData2 = new ArrayList<>();
// 将文档的数据平均拆分成2份,用于读入后面的2个缓冲区中
for (int i = 0; i < tempWords.size() / 2; i++) {
temp = new String[2];
temp[0] = tempWords.get(i);
temp[1] = doc.docId + "";
invertedData1.add(temp);
temp = new String[2];
temp[0] = tempWords.get(i + tempWords.size() / 2);
temp[1] = doc.docId + "";
invertedData2.add(temp);
}
// 如果是奇数个,则将最后一个补入
if (tempWords.size() % 2 == 1) {
temp = new String[2];
temp[0] = tempWords.get(tempWords.size() - 1);
temp[1] = doc.docId + "";
invertedData2.add(temp);
}
index = doc.filePath.lastIndexOf(".");
baseFilePath = doc.filePath.substring(0, index);
mergeInvertedData(invertedData1, invertedData2, ifSort, baseFilePath
+ "-temp.txt");
}
/**
* 合并读缓冲区数据写到写缓冲区中,用到了归并排序算法
*
* @param outputPath
* 写缓冲区的写出的路径
*/
private void mergeWordBuffers(String outputPath) {
int i = 0;
int j = 0;
int num1 = 0;
int num2 = 0;
// 写缓冲区下标
int writeIndex = 0;
while (readBuffer1[i][0] != null && readBuffer2[j][0] != null) {
num1 = Integer.parseInt(readBuffer1[i][0]);
num2 = Integer.parseInt(readBuffer2[j][0]);
// 如果缓冲1小,则优先存缓冲1到写缓冲区中
if (num1 < num2) {
writeBuffer[writeIndex][0] = num1 + "";
writeBuffer[writeIndex][1] = readBuffer1[i][1];
i++;
} else if (num2 < num1) {
writeBuffer[writeIndex][0] = num2 + "";
writeBuffer[writeIndex][1] = readBuffer1[j][1];
j++;
} else if (num1 == num2) {
// 如果两个缓冲区中的数字一样,说明是同个有效词,先进行合并再写入
writeBuffer[writeIndex][0] = num1 + "";
writeBuffer[writeIndex][1] = readBuffer1[i][1] + ":"
+ readBuffer2[j][1];
i++;
j++;
}
// 写的指针往后挪一位
writeIndex++;
// 如果写满写缓冲区时,进行写出到文件操作
if (writeIndex >= writeBufferSize) {
writeOutOperation(writeBuffer, outputPath);
writeIndex = 0;
}
}
if (readBuffer1[i][0] == null) {
writeRemainReadBuffer(readBuffer2, j, outputPath);
}
if (readBuffer2[j][0] == null) {
writeRemainReadBuffer(readBuffer1, j, outputPath);
}
}
/**
* 将数据写出到磁盘文件操作,如果文件已经存在,则在文件尾部进行内容追加
*
* @param buffer
* 当前写缓冲中的数据
* @param filePath
* 输出地址
*/
private void writeOutOperation(String[][] buffer, String filePath) {
String word;
StringBuilder strBuilder = new StringBuilder();
// 将缓冲中的数据组成字符写入到文件中
for (String[] array : buffer) {
if (array[0] == null) {
continue;
}
word = array[0];
strBuilder.append(word);
strBuilder.append(" ");
strBuilder.append(array[1]);
strBuilder.append("\n");
}
try {
File file = new File(filePath);
PrintStream ps = new PrintStream(new FileOutputStream(file));
ps.print(strBuilder.toString());// 往文件里写入字符串
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/**
* 将数据写出到磁盘文件操作,如果文件已经存在,则在文件尾部进行内容追加
*
* @param buffer
* 当前写缓冲中的数据
* @param filePath
* 输出地址
* @param isCoded
* 是否以编码的方式输出
*/
private void writeOutOperation(String[][] buffer, String filePath, boolean isCoded) {
String word;
StringBuilder strBuilder = new StringBuilder();
// 将缓冲中的数据组成字符写入到文件中
for (String[] array : buffer) {
if (array[0] == null) {
continue;
}
if(!isCoded){
word = code2word.get(array[0]);
}else{
word = array[0];
}
strBuilder.append(word);
strBuilder.append(" ");
strBuilder.append(array[1]);
strBuilder.ap |