|
race();
}
// 将每行词做拆分加入到总列表容器中
for (String[] array : dataArray) {
for (String word : array) {
words.add(word);
}
}
return words;
}
/**
 * Pre-processes the words of every input document.
 *
 * For each path in {@code docFilePaths} the words are read with
 * {@code readDataFile}, cleaned with {@code filterWords}, and written to a
 * sibling file whose name is the input path without its extension plus
 * {@code "-efword.txt"}. The output paths are collected, in input order, in
 * {@code effectWordPaths}, which is re-created on every call.
 */
public void preTreatWords() {
    effectWordPaths = new ArrayList<>();

    for (String filePath : docFilePaths) {
        ArrayList<String> tempWords = readDataFile(filePath);
        filterWords(tempWords, true);

        // Build the output path from the input path minus its extension.
        // Only a dot located after the last path separator is an extension:
        // a path without any dot used to throw from substring(0, -1), and a
        // dot in a directory name (e.g. "./data/doc") used to cut the path.
        int lastSeparator = Math.max(filePath.lastIndexOf('/'),
                filePath.lastIndexOf('\\'));
        int endPos = filePath.lastIndexOf('.');
        String baseOutputPath = endPos > lastSeparator
                ? filePath.substring(0, endPos)
                : filePath;

        String outputPath = baseOutputPath + "-efword.txt";
        writeOutOperation(tempWords, outputPath);
        effectWordPaths.add(outputPath);
    }
}
/**
 * Filters and normalizes the words of one document, in place.
 *
 * Processing steps:
 * <ol>
 * <li>every word is lower-cased;</li>
 * <li>words found in {@code FILTER_WORDS} (compared ignoring case), words
 * ending in "ly", "ful" or "ing", and plain numbers are removed;</li>
 * <li>words ending in "ed" lose their last two characters;</li>
 * <li>repeated words are collapsed: words that occur once keep their
 * relative order, and each repeated word is appended exactly once after
 * them, in order of first occurrence.</li>
 * </ol>
 *
 * @param words
 *            words of the document to process; modified in place. A
 *            {@code null} or empty list is left untouched.
 * @param canRepeated
 *            whether valid words may be repeated. NOTE(review): this flag is
 *            not consulted; duplicates are always collapsed, which is what
 *            the existing caller in this class receives today - confirm the
 *            intended semantics before wiring it in.
 */
private void filterWords(ArrayList<String> words, boolean canRepeated) {
    if (words == null || words.isEmpty()) {
        return;
    }

    // Adjective / adverb / gerund suffixes.
    Pattern adjPattern = Pattern.compile(".*(ly$|ful$|ing$)");
    // Past-tense verb suffix.
    Pattern formerPattern = Pattern.compile(".*ed$");
    // Integer or decimal number; the dot is escaped so that only a literal
    // decimal point is accepted (an unescaped '.' also matched e.g. "1a2").
    Pattern numberPattern = Pattern.compile("[0-9]+(\\.[0-9]+)?");

    ArrayList<String> deleteWords = new ArrayList<>();

    for (int i = 0; i < words.size(); i++) {
        // Normalize case first so that the stop-word and suffix checks also
        // catch capitalized or upper-case input such as "The" or "WALKED".
        String w = words.get(i).toLowerCase();

        boolean isFilterWord = false;
        for (String fw : FILTER_WORDS) {
            if (fw.equalsIgnoreCase(w)) {
                isFilterWord = true;
                break;
            }
        }

        if (isFilterWord || adjPattern.matcher(w).matches()
                || numberPattern.matcher(w).matches()) {
            // Stop word, adjective/adverb form or pure number: drop it.
            deleteWords.add(w);
        } else if (formerPattern.matcher(w).matches()) {
            // "-ed" marks a tense change; cut the two-letter suffix to get
            // back to the base verb form.
            w = w.substring(0, w.length() - 2);
        }

        words.set(i, w);
    }

    // Remove every occurrence of the rejected words.
    words.removeAll(deleteWords);
    deleteWords.clear();

    // Collect each word that occurs more than once, recording it only once
    // (recording it per extra occurrence left words seen three or more times
    // duplicated in the result).
    for (int i = 0; i < words.size() - 1; i++) {
        String s1 = words.get(i);
        if (deleteWords.contains(s1)) {
            continue;
        }

        for (int j = i + 1; j < words.size(); j++) {
            if (s1.equals(words.get(j))) {
                deleteWords.add(s1);
                break;
            }
        }
    }

    // Drop all copies of the repeated words, then add one copy of each back.
    words.removeAll(deleteWords);
    words.addAll(deleteWords);
}
/**
 * Writes the buffered words to a file on disk, one word per line.
 *
 * The file is opened with {@code new FileOutputStream(file)}, so an existing
 * file is overwritten, not appended to. If the file cannot be opened the
 * stack trace is printed and nothing is written.
 *
 * @param buffer
 *            words currently held in the write buffer
 * @param filePath
 *            output file path
 */
private void writeOutOperation(ArrayList<String> buffer, String filePath) {
    // Assemble the whole file content first so it is written in one call.
    StringBuilder strBuilder = new StringBuilder();
    for (String word : buffer) {
        strBuilder.append(word);
        strBuilder.append("\n");
    }

    // try-with-resources closes the stream (and the underlying file handle)
    // even when printing fails; previously the stream was never closed.
    try (PrintStream ps = new PrintStream(new FileOutputStream(new File(filePath)))) {
        ps.print(strBuilder.toString());
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
}
文档类 Document.java:
package InvertedIndex;
import java.util.ArrayList;
/**
 * A document: its identifier, its file path and the valid words it contains.
 *
 * @author lyq
 *
 */
public class Document {
    // Unique identifier of the document; stays 0 when the two-argument
    // constructor is used.
    int docId;
    // File path of the document.
    String filePath;
    // Valid words contained in the document. The list passed to the
    // constructor is stored as-is (no copy is made).
    ArrayList<String> effectWords;

    /**
     * Creates a document without assigning an identifier.
     *
     * @param effectWords
     *            valid words of the document
     * @param filePath
     *            file path of the document
     */
    public Document(ArrayList<String> effectWords, String filePath) {
        this.effectWords = effectWords;
        this.filePath = filePath;
    }

    /**
     * Creates a document with an explicit identifier.
     *
     * @param effectWords
     *            valid words of the document
     * @param filePath
     *            file path of the document
     * @param docId
     *            identifier of the document
     */
    public Document(ArrayList<String> effectWords, String filePath, int docId) {
        this(effectWords, filePath);
        this.docId = docId;
    }
}
BSBI 算法工具类 BSBITool.java:
package InvertedIndex;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
* BSBI基于磁盘的外部排序算法
*
* @author lyq
*
*/
public class BSBITool {
// 文档唯一标识ID
public static int DOC_ID = 0;
// 读缓冲区的大小
private int readBufferSize;
// 写缓冲区的大小
private int writeBufferSize;
// 读入的文档的有效词文件地址
private ArrayList effectiveWordFiles;
// 倒排索引输出文件地址
private String outputFilePath;
// 读缓冲 1
private String[][] readBuffer1;
// 读缓冲2
private String[][] readBuffer2;
// 写缓冲区
private String[][] writeBuffer;
// 有效词与hashcode的映射
private Map code2word;
public BSBITool(ArrayList effectiveWor |