倒排索引构建算法BSBI和SPIMI - 数据库编程

TOP

倒排索引构建算法BSBI和SPIMI(二)

2015-07-24 12:16:29 来源: 作者: 【大中小】浏览:762次

race(); } // 将每行词做拆分加入到总列表容器中 for (String[] array : dataArray) { for (String word : array) { words.add(word); } } return words; } /** * 对文档内容词汇进行预处理 */ public void preTreatWords() { String baseOutputPath = ""; int endPos = 0; ArrayList tempWords = null; effectWordPaths = new ArrayList<>(); for (String filePath : docFilePaths) { tempWords = readDataFile(filePath); filterWords(tempWords, true); // 重新组装出新的输出路径 endPos = filePath.lastIndexOf("."); baseOutputPath = filePath.substring(0, endPos); writeOutOperation(tempWords, baseOutputPath + "-efword.txt"); effectWordPaths.add(baseOutputPath + "-efword.txt"); } } /** * * 对文档中的词语进行过滤操作 * * @param words * 待处理文档词语 * @param canRepeated * 有效词是否可以重复 */ private void filterWords(ArrayList words, boolean canRepeated) { boolean isFilterWord; // 做形容词匹配 Pattern adjPattern; // 做动词时态的匹配 Pattern formerPattern; // 数字匹配 Pattern numberPattern; Matcher adjMatcher; Matcher formerMatcher; Matcher numberMatcher; ArrayList deleteWords = new ArrayList<>(); adjPattern = Pattern.compile(".*(ly$|ful$|ing$)"); formerPattern = Pattern.compile(".*ed$"); numberPattern = Pattern.compile("[0-9]+(.[0-9]+)?"); String w; for (int i = 0; i < words.size(); i++) { w = words.get(i); isFilterWord = false; for (String fw : FILTER_WORDS) { if (fw.equals(w)) { deleteWords.add(w); isFilterWord = true; break; } } if (isFilterWord) { continue; } adjMatcher = adjPattern.matcher(w); formerMatcher = formerPattern.matcher(w); numberMatcher = numberPattern.matcher(w); // 将词语统一小写字母化 w = w.toLowerCase(); // 如果是形容词,副词形式的或是纯数字的词，则进行过滤 if (adjMatcher.matches() || numberMatcher.matches()) { deleteWords.add(w); } else if (formerMatcher.matches()) { // 如果是ed结尾表明是动词的在时态方面的变化，进行变化，转为原有动词的形式，截去最末尾2个额外添加的后缀词 w = w.substring(0, w.length() - 2); } words.set(i, w); } // 进行无效词的过滤 words.removeAll(deleteWords); deleteWords.clear(); String s1; String s2; // 进行词语的去重 for (int i = 0; i < words.size() - 1; i++) { s1 = words.get(i); for (int j = i + 1; j < words.size(); j++) { s2 = words.get(j); 
// 找到存在相同的词了，就挑出循环 if (s1.equals(s2)) { deleteWords.add(s1); break; } } } // 删除多余重复的词语 words.removeAll(deleteWords); words.addAll(deleteWords); } /** * 将数据写出到磁盘文件操作，如果文件已经存在，则在文件尾部进行内容追加 * * @param buffer * 当前写缓冲中的数据 * @param filePath * 输出地址 */ private void writeOutOperation(ArrayList buffer, String filePath) { StringBuilder strBuilder = new StringBuilder(); // 将缓冲中的数据组成字符写入到文件中 for (String word : buffer) { strBuilder.append(word); strBuilder.append("\n"); } try { File file = new File(filePath); PrintStream ps = new PrintStream(new FileOutputStream(file)); ps.print(strBuilder.toString());// 往文件里写入字符串 } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } 文档类Document.java:

package InvertedIndex;

import java.util.ArrayList;

/**
 * A document: its id, its file path and the effective words it contains.
 *
 * @author lyq
 *
 */
public class Document {
	// Unique identifier of the document
	int docId;
	// File path of the document
	String filePath;
	// Effective words contained in the document
	ArrayList<String> effectWords;

	/**
	 * Creates a document without assigning an id; docId keeps the int
	 * default of 0.
	 *
	 * @param effectWords
	 *            effective words of the document (stored by reference, not
	 *            copied)
	 * @param filePath
	 *            file path of the document
	 */
	public Document(ArrayList<String> effectWords, String filePath) {
		this.effectWords = effectWords;
		this.filePath = filePath;
	}

	/**
	 * Creates a document with an explicit id.
	 *
	 * @param effectWords
	 *            effective words of the document (stored by reference, not
	 *            copied)
	 * @param filePath
	 *            file path of the document
	 * @param docId
	 *            unique identifier of the document
	 */
	public Document(ArrayList<String> effectWords, String filePath, int docId) {
		this(effectWords, filePath);
		this.docId = docId;
	}
}

BSBI算法工具类BSBITool.java:

package InvertedIndex;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * BSBI基于磁盘的外部排序算法
 * 
 * @author lyq
 * 
 */
public class BSBITool {
	// Unique document ID counter
	public static int DOC_ID = 0;

	// Size of the read buffer
	private int readBufferSize;
	// Size of the write buffer
	private int writeBufferSize;
	// Paths of the effective-word files of the documents that are read in
	private ArrayList effectiveWordFiles;
	// Output file path of the inverted index
	private String outputFilePath;
	// Read buffer 1
	private String[][] readBuffer1;
	// Read buffer 2
	private String[][] readBuffer2;
	// Write buffer
	private String[][] writeBuffer;
	// Mapping between effective words and their hash codes
	// NOTE(review): the name suggests code -> word; confirm key/value types
	private Map code2word;

	public BSBITool(ArrayList effectiveWor

首页上一页 1 2 3 4 5 6 7 下一页尾页 2/7/7
【大中小】【打印】【繁体】【投稿】【收藏】【推荐】【举报】【评论】【关闭】【返回顶部】
分享到:
上一篇：通过DAC杀死指定会话	下一篇：Power Designer反向工程/正向工程..

帐　　号:

密码: (新用户注册)

验证码:

表　　情:

内　　容: