newechoes/wasm/search/src/lib.rs

1025 lines
35 KiB
Rust
Raw Normal View History

use wasm_bindgen::prelude::*;
use utils_common::compression::from_compressed_with_max_version;
use crate::models::{ArticleSearchIndex, SearchRequest, SearchResult, SearchResultItem, HeadingNode, HeadingIndexEntry, SuggestionCandidate, SearchSuggestion, SuggestionType};
use std::collections::{HashMap, HashSet};
use web_sys;
pub mod models;
pub mod builder;
/// WASM entry point: runs a search (or autocomplete) request against a compressed index.
///
/// `index_data` is the compressed `ArticleSearchIndex`; `request_json` is a JSON-encoded
/// `SearchRequest`. On success the JSON-encoded `SearchResult` is returned, with `time_ms`
/// set to the elapsed time measured here.
///
/// # Errors
///
/// Returns a `JsValue` string when the request cannot be parsed, the index cannot be
/// decompressed, or the result cannot be serialized.
#[wasm_bindgen]
pub fn search_articles(index_data: &[u8], request_json: &str) -> Result<String, JsValue> {
    // Forward Rust panics to the JS console.
    std::panic::set_hook(Box::new(console_error_panic_hook::hook));

    // Falls back to 0.0 when no `window`/`performance` object is available.
    let now_ms = || {
        web_sys::window()
            .and_then(|w| w.performance())
            .map(|p| p.now())
            .unwrap_or(0.0)
    };
    let started_at = now_ms();

    // Parse the request.
    let req: SearchRequest = serde_json::from_str(request_json)
        .map_err(|e| JsValue::from_str(&format!("解析搜索请求失败: {}", e)))?;

    // Decompress the search index.
    let search_index = from_compressed_with_max_version::<ArticleSearchIndex>(index_data, 9)
        .map_err(|e| JsValue::from_str(&format!("解压搜索索引失败: {}", e)))?;

    // Dispatch on the requested search type; anything but "autocomplete" is a full search.
    let mut result = if req.search_type.as_str() == "autocomplete" {
        perform_autocomplete(&search_index, &req)
    } else {
        perform_search(&search_index, &req)
    };

    // Record the elapsed time.
    let finished_at = now_ms();
    result.time_ms = (finished_at - started_at) as usize;

    // Serialize the result.
    serde_json::to_string(&result)
        .map_err(|e| JsValue::from_str(&format!("序列化搜索结果失败: {}", e)))
}
/// Turns a raw query into the list of search terms.
///
/// The query is trimmed and lowercased and returned as a single term; no tokenization is
/// performed. A blank query yields an empty list.
fn split_query_to_terms(query: &str) -> Vec<String> {
    let normalized = query.trim().to_lowercase();
    if normalized.is_empty() {
        Vec::new()
    } else {
        // The whole query is the one and only term.
        vec![normalized]
    }
}
/// 获取搜索建议
fn get_search_suggestions(search_index: &ArticleSearchIndex, query: &str) -> Vec<SearchSuggestion> {
let query = query.trim().to_lowercase();
// 如果查询为空,返回热门词汇
if query.is_empty() {
let mut common_terms: Vec<(String, usize)> = search_index.common_terms
.iter()
.map(|(term, freq)| (term.clone(), *freq))
.collect();
common_terms.sort_by(|a, b| b.1.cmp(&a.1)); // 按频率降序排序
return common_terms.iter().take(10).map(|(term, _)| {
SearchSuggestion {
text: term.clone(),
suggestion_type: SuggestionType::Completion,
matched_text: String::new(),
suggestion_text: term.clone(),
}
}).collect();
}
// 保存所有候选建议
let mut candidates: Vec<SuggestionCandidate> = Vec::new();
// 第1步: 标题完全匹配
for (_, article) in search_index.articles.iter().enumerate() {
let title_lower = article.title.to_lowercase();
if title_lower == query {
// 找到完全匹配标题的文章,不返回完全相同的建议
continue;
} else if title_lower.starts_with(&query) {
// 标题以查询开头,作为前缀补全
candidates.push(SuggestionCandidate {
text: article.title.clone(),
score: 100,
suggestion_type: SuggestionType::Completion,
frequency: 100
});
} else if title_lower.contains(&query) {
// 标题包含查询,作为纠正建议
candidates.push(SuggestionCandidate {
text: article.title.clone(),
score: 90,
suggestion_type: SuggestionType::Correction,
frequency: 90
});
}
}
// 第2步: 独立词汇匹配
for (term, freq) in &search_index.common_terms {
let term_lower = term.to_lowercase();
// 跳过与查询完全相同的词汇
if term_lower == query {
continue;
}
if term_lower.starts_with(&query) {
// 前缀匹配,作为补全建议
candidates.push(SuggestionCandidate {
text: term.clone(),
score: 95,
suggestion_type: SuggestionType::Completion,
frequency: *freq
});
} else if term_lower.contains(&query) {
// 包含关系,作为纠正建议
candidates.push(SuggestionCandidate {
text: term.clone(),
score: 85,
suggestion_type: SuggestionType::Correction,
frequency: *freq
});
}
}
// 第3步: 编辑距离匹配
if candidates.len() < 5 {
for (term, freq) in &search_index.common_terms {
let term_lower = term.to_lowercase();
// 跳过已添加的词汇和完全相同的词汇
if term_lower == query || candidates.iter().any(|s| s.text.to_lowercase() == term_lower) {
continue;
}
// 计算编辑距离
let distance = levenshtein_distance(&query, &term_lower);
// 只考虑编辑距离较小的词
let max_allowed_distance = query.len().min(3);
if distance <= max_allowed_distance as i32 {
// 编辑距离分数: 基础分80,减去距离值
let edit_score = 80 - distance * 5;
candidates.push(SuggestionCandidate {
text: term.clone(),
score: edit_score,
suggestion_type: SuggestionType::Correction,
frequency: *freq
});
}
}
}
// 首先按分数和频率排序
candidates.sort_by(|a, b| {
match b.score.cmp(&a.score) {
std::cmp::Ordering::Equal => b.frequency.cmp(&a.frequency),
other => other
}
});
// 转换为SearchSuggestion格式并截取前10个结果
candidates.iter()
.take(10)
.map(|candidate| {
let text_lower = candidate.text.to_lowercase();
let (matched_text, suggestion_text) = match candidate.suggestion_type {
SuggestionType::Completion if text_lower.starts_with(&query) => {
// 前缀匹配:分离已匹配部分和建议部分,保留原始大小写
let original_case_matched = &candidate.text[..query.len()];
let original_case_suggestion = &candidate.text[query.len()..];
(original_case_matched.to_string(), original_case_suggestion.to_string())
},
_ => {
// 纠正建议:用户输入作为匹配部分,完整建议作为建议部分
(query.to_string(), candidate.text.clone())
}
};
SearchSuggestion {
text: candidate.text.clone(),
suggestion_type: candidate.suggestion_type.clone(),
matched_text,
suggestion_text,
}
})
.collect()
}
/// Computes the Levenshtein edit distance between `s1` and `s2`, counted in characters.
///
/// Insertions, deletions and substitutions each cost 1.
fn levenshtein_distance(s1: &str, s2: &str) -> i32 {
    let source: Vec<char> = s1.chars().collect();
    let target: Vec<char> = s2.chars().collect();

    // Against an empty string the distance is the other string's length.
    if source.is_empty() {
        return target.len() as i32;
    }
    if target.is_empty() {
        return source.len() as i32;
    }

    // Only the previous and the current DP row are needed at any time.
    let mut prev_row: Vec<i32> = (0..=target.len() as i32).collect();
    let mut curr_row: Vec<i32> = vec![0; target.len() + 1];

    for (i, &source_ch) in source.iter().enumerate() {
        curr_row[0] = i as i32 + 1;
        for (j, &target_ch) in target.iter().enumerate() {
            let deletion = prev_row[j + 1] + 1;
            let insertion = curr_row[j] + 1;
            let substitution = prev_row[j] + i32::from(source_ch != target_ch);
            curr_row[j + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut prev_row, &mut curr_row);
    }

    // After the final swap the last completed row is in `prev_row`.
    prev_row[target.len()]
}
/// Handles an autocomplete request.
///
/// The query is lowercased. An empty query produces an empty result (page size 10, zero
/// pages); otherwise the suggestions from `get_search_suggestions` are returned as a
/// single page with no result items. `time_ms` is left at 0 for the caller to fill in.
fn perform_autocomplete(search_index: &ArticleSearchIndex, req: &SearchRequest) -> SearchResult {
    let query = req.query.to_lowercase();

    // Same suggestion logic as the regular search; skipped entirely for an empty query.
    let suggestions = if query.is_empty() {
        None
    } else {
        Some(get_search_suggestions(search_index, &query))
    };

    match suggestions {
        None => SearchResult {
            items: Vec::new(),
            total: 0,
            page: 1,
            page_size: 10,
            total_pages: 0,
            time_ms: 0,
            query,
            suggestions: Vec::new(),
        },
        Some(suggestions) => {
            let count = suggestions.len();
            SearchResult {
                items: Vec::new(), // autocomplete carries no result items
                total: count,
                page: 1,
                page_size: count,
                total_pages: 1,
                time_ms: 0, // filled in by the caller
                query,
                suggestions,
            }
        }
    }
}
/// 执行搜索
fn perform_search(search_index: &ArticleSearchIndex, req: &SearchRequest) -> SearchResult {
let query = req.query.to_lowercase();
// 如果查询为空,返回空结果
if query.is_empty() {
return SearchResult {
items: Vec::new(),
total: 0,
page: req.page,
page_size: req.page_size,
total_pages: 0,
time_ms: 0,
query: query.clone(),
suggestions: Vec::new(),
};
}
// 分词 - 第一个词是完整查询
let terms = split_query_to_terms(&query);
if terms.is_empty() {
return SearchResult {
items: Vec::new(),
total: 0,
page: req.page,
page_size: req.page_size,
total_pages: 0,
time_ms: 0,
query: query.clone(),
suggestions: Vec::new(),
};
}
// 找到匹配的文章ID及其得分 - 已按匹配优先级排序
let matched_articles = find_matched_articles(search_index, &terms);
// 处理每个匹配的文章
let mut all_items = Vec::new();
for (article_id, base_score) in matched_articles {
if article_id >= search_index.articles.len() {
continue;
}
let article = &search_index.articles[article_id];
// 构建标题树和匹配内容
let heading_tree = build_heading_tree_with_matches(article, &terms, search_index);
// 高亮处理文章标题
let highlighted_title = if !terms.is_empty() {
highlight_title(&article.title, &terms[0])
} else {
article.title.clone()
};
// 创建搜索结果项
let result_item = SearchResultItem {
id: article.id.clone(),
title: highlighted_title,
summary: article.summary.clone(),
url: article.url.clone(),
score: base_score,
heading_tree,
page_type: article.page_type.clone(),
};
all_items.push(result_item);
}
// 分页处理
let total = all_items.len();
let total_pages = (total + req.page_size - 1) / req.page_size;
let start_idx = (req.page - 1) * req.page_size;
let end_idx = std::cmp::min(start_idx + req.page_size, total);
let paged_results = if start_idx < total {
all_items[start_idx..end_idx].to_vec()
} else {
Vec::new()
};
// 生成搜索建议
let suggestions = get_search_suggestions(search_index, &query);
SearchResult {
items: paged_results,
total,
page: req.page,
page_size: req.page_size,
total_pages,
time_ms: 0, // 由外部函数填充
query: query.clone(),
suggestions,
}
}
/// Wraps every case-insensitive occurrence of `query` in `title` with `<mark>…</mark>`.
///
/// Matches are found left to right and do not overlap. The original casing of `title` is
/// preserved in the output. If either argument is empty, or nothing matches, `title` is
/// returned unchanged.
///
/// Matching runs on the lowercased title, but lowercasing can change the UTF-8 length of
/// a character (e.g. `İ` becomes two code points), so match offsets are mapped back to
/// the original string character by character instead of being reused directly.
fn highlight_title(title: &str, query: &str) -> String {
    if title.is_empty() || query.is_empty() {
        return title.to_string();
    }

    let title_lower = title.to_lowercase();
    let query_lower = query.to_lowercase();
    if query_lower.is_empty() {
        return title.to_string();
    }

    // For every byte of `title_lower`: the byte range in `title` of the character that
    // produced it.
    let mut source_spans: Vec<(usize, usize)> = Vec::with_capacity(title_lower.len());
    for (start, ch) in title.char_indices() {
        let end = start + ch.len_utf8();
        let lowered_len: usize = ch.to_lowercase().map(char::len_utf8).sum();
        source_spans.extend(std::iter::repeat((start, end)).take(lowered_len));
    }

    let mut highlighted = String::with_capacity(title.len() + 16);
    let mut last_pos = 0;

    for (lower_start, matched) in title_lower.match_indices(query_lower.as_str()) {
        let lower_end = lower_start + matched.len();

        // Start of the character containing the first matched byte, end of the character
        // containing the last matched byte.
        let mapped = match (source_spans.get(lower_start), source_spans.get(lower_end - 1)) {
            (Some(&(start, _)), Some(&(_, end))) => Some((start, end)),
            _ => None,
        };

        if let Some((start, end)) = mapped {
            // Never step back into text that has already been emitted.
            let start = start.max(last_pos);
            if end <= start {
                continue;
            }
            highlighted.push_str(&title[last_pos..start]);
            highlighted.push_str("<mark>");
            highlighted.push_str(&title[start..end]);
            highlighted.push_str("</mark>");
            last_pos = end;
        }
    }

    // Text after the last match (the whole title when nothing matched).
    highlighted.push_str(&title[last_pos..]);
    highlighted
}
/// Finds articles matching the first search term and returns `(article index, score)`
/// pairs sorted by score, highest first.
///
/// Each article is reported once, with the score of the first tier it qualifies for:
///
/// | tier | condition                                             | score |
/// |------|-------------------------------------------------------|-------|
/// | 1    | title starts with the query (and is longer than it)   | 115   |
/// | 2    | title contains the query (including an exact match)   | 99    |
/// | 3    | query is a key of `title_term_index`                  | 85    |
/// | 4    | query is a key of `heading_term_index`                | 80    |
/// | 5    | query is a key of `content_term_index`                | 75    |
/// | 6    | nothing above matched: article content contains query | 50    |
///
/// Returns an empty list when `terms` is empty.
fn find_matched_articles(search_index: &ArticleSearchIndex, terms: &[String]) -> Vec<(usize, f64)> {
    let query = match terms.first() {
        Some(term) => term.to_lowercase(),
        None => return Vec::new(),
    };

    let article_count = search_index.articles.len();
    let mut scored: Vec<(usize, f64)> = Vec::new();
    let mut seen: HashSet<usize> = HashSet::new();

    // Tiers 1 and 2: a single pass over the titles, lowercasing each title once.
    // A title equal to the query is deliberately not a prefix match, so it lands in the
    // "contains" tier. This keeps the existing ranking; a separate exact-match pass
    // could never find an article that this pass had not already claimed.
    for (article_id, article) in search_index.articles.iter().enumerate() {
        let title_lower = article.title.to_lowercase();
        let score = if title_lower.starts_with(query.as_str()) && title_lower != query {
            115.0
        } else if title_lower.contains(query.as_str()) {
            99.0
        } else {
            continue;
        };
        scored.push((article_id, score));
        seen.insert(article_id);
    }

    // Tier 3: title term index. Out-of-range indexes are not filtered here.
    if let Some(article_ids) = search_index.title_term_index.get(&query) {
        for &article_id in article_ids {
            if seen.insert(article_id) {
                scored.push((article_id, 85.0));
            }
        }
    }

    // Tier 4: heading term index; heading ids are resolved to their article.
    if let Some(heading_ids) = search_index.heading_term_index.get(&query) {
        for heading_id in heading_ids {
            if let Some(article_id) = extract_article_id_from_heading(heading_id) {
                if article_id < article_count && seen.insert(article_id) {
                    scored.push((article_id, 80.0));
                }
            }
        }
    }

    // Tier 5: content term index.
    if let Some(article_ids) = search_index.content_term_index.get(&query) {
        for &article_id in article_ids {
            if article_id < article_count && seen.insert(article_id) {
                scored.push((article_id, 75.0));
            }
        }
    }

    // Tier 6: last resort, a substring scan over every article body.
    if scored.is_empty() {
        for (article_id, article) in search_index.articles.iter().enumerate() {
            if article.content.to_lowercase().contains(query.as_str()) {
                scored.push((article_id, 50.0));
            }
        }
    }

    // Highest score first. The sort is stable, so articles within a tier keep the order
    // in which they were found.
    scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    scored
}
/// Extracts the article index from a heading id of the form `"article_id:heading_index"`.
///
/// Returns `None` when the id contains no `:` or the part before the first `:` is not a
/// valid `usize`.
fn extract_article_id_from_heading(heading_id: &str) -> Option<usize> {
    let colon_pos = heading_id.find(':')?;
    // `find` returns the byte offset of a character, so this slice is always valid.
    heading_id[..colon_pos].parse::<usize>().ok()
}
/// Searches the text below `heading` for the first search term and returns a highlighted
/// excerpt together with the matched terms.
///
/// The section text is `article.content` from just past the heading line
/// (`start_position + text.len() + level + 1`) up to `heading.end_position`, with both
/// ends clamped to the content and moved onto character boundaries. Only the complete
/// query (`terms[0]`) is matched, case-insensitively and without overlaps.
///
/// Returns `None` when the section is empty or blank, when `terms` is empty or its first
/// term is empty, or when nothing matches.
fn find_matches_in_paragraph(article: &utils_common::models::ArticleMetadata, heading: &HeadingIndexEntry, terms: &[String]) -> Option<(String, Vec<String>)> {
    let full_text: &str = &article.content;

    // Skip the heading line itself; the trailing +1 accounts for the space after the
    // heading marker.
    let mut content_start = heading.start_position + heading.text.len() + heading.level + 1;
    if content_start < full_text.len() {
        content_start = find_char_boundary(full_text, content_start);
    }
    let content_end = find_char_boundary(full_text, heading.end_position.min(full_text.len()));

    if content_start >= content_end || content_start >= full_text.len() {
        return None;
    }

    let content = &full_text[content_start..content_end];
    if content.trim().is_empty() {
        return None;
    }

    // Only the complete query is matched.
    let complete_query = terms.first()?.to_lowercase();
    if complete_query.is_empty() {
        return None;
    }
    let content_lower = content.to_lowercase();

    // Lowercasing can change the UTF-8 length of a character, so offsets found in
    // `content_lower` are not valid offsets into `content`. `locate` walks `content` once,
    // in step with the matches, and maps a lowercase byte offset to the byte range of the
    // original character that produced it. It must be called with non-decreasing offsets.
    let mut source_chars = content.char_indices();
    let mut lowered_so_far = 0usize;
    // (start in `content`, end in `content`, end in `content_lower`) of the current char.
    let mut current: Option<(usize, usize, usize)> = None;
    let mut locate = |lower_offset: usize| -> Option<(usize, usize)> {
        loop {
            if let Some((start, end, lower_end)) = current {
                if lower_offset < lower_end {
                    return Some((start, end));
                }
            }
            let (start, ch) = source_chars.next()?;
            lowered_so_far += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
            current = Some((start, start + ch.len_utf8(), lowered_so_far));
        }
    };

    // (start, end, priority) ranges in `content`; priority 1 marks a complete-query match.
    let mut term_positions: Vec<(usize, usize, i32)> = Vec::new();
    for (lower_start, matched) in content_lower.match_indices(complete_query.as_str()) {
        let lower_end = lower_start + matched.len();
        let start = locate(lower_start).map(|(char_start, _)| char_start);
        let end = locate(lower_end - 1).map(|(_, char_end)| char_end);

        if let (Some(start), Some(end)) = (start, end) {
            // Keep ranges disjoint even if two matches map into the same character.
            let start = match term_positions.last() {
                Some(&(_, previous_end, _)) => start.max(previous_end),
                None => start,
            };
            if end > start {
                term_positions.push((start, end, 1));
            }
        }
    }

    if term_positions.is_empty() {
        return None;
    }

    // Matches are produced in ascending order, and only one distinct term can match.
    let highlighted_content = format_matched_content(content, &term_positions);
    Some((highlighted_content, vec![terms[0].clone()]))
}
/// Renders `content` with every range in `term_positions` wrapped in `<mark>…</mark>`.
///
/// `term_positions` holds `(start, end, priority)` byte ranges into `content`. Content of
/// at most 300 bytes is rendered in full. Longer content is cut down to a window of up to
/// 150 bytes on either side of the first priority-1 match (or of the first match when
/// there is none), with `...` added on each truncated side; matches that do not lie
/// entirely inside the window are dropped. All slice indexes are moved onto character
/// boundaries first.
///
/// With no positions or empty content, `content` is returned unchanged.
fn format_matched_content(content: &str, term_positions: &[(usize, usize, i32)]) -> String {
    /// Appends `fragment` wrapped in highlight markers.
    fn push_marked(out: &mut String, fragment: &str) {
        out.push_str("<mark>");
        out.push_str(fragment);
        out.push_str("</mark>");
    }

    if term_positions.is_empty() || content.is_empty() {
        return content.to_string();
    }

    let mut rendered = String::new();

    if content.len() > 300 {
        // Anchor the excerpt on the first high-priority match, else on the first match.
        let (anchor_start, anchor_end, _) = term_positions
            .iter()
            .find(|&&(_, _, priority)| priority == 1)
            .copied()
            .unwrap_or(term_positions[0]);

        // Excerpt window, snapped to character boundaries.
        let window_start = find_char_boundary(content, anchor_start.saturating_sub(150));
        let window_end =
            find_char_boundary(content, std::cmp::min(anchor_end + 150, content.len()));
        let window = &content[window_start..window_end];

        let mut cursor = 0;
        for &(start, end, _) in term_positions {
            // Only matches that lie completely inside the window are rendered.
            if start < window_start || end > window_end {
                continue;
            }
            let rel_start = start - window_start;
            let rel_end = end - window_start;

            // Plain text between the previous match and this one.
            if rel_start > cursor && rel_start <= window.len() {
                let from = find_char_boundary(window, cursor);
                let to = find_char_boundary(window, rel_start);
                if to > from {
                    rendered.push_str(&window[from..to]);
                }
            }

            // The match itself.
            if rel_end <= window.len() {
                let from = find_char_boundary(window, rel_start);
                let to = find_char_boundary(window, rel_end);
                if to > from {
                    push_marked(&mut rendered, &window[from..to]);
                }
                cursor = to;
            }
        }

        // Text after the last match.
        if cursor < window.len() {
            rendered.push_str(&window[find_char_boundary(window, cursor)..]);
        }

        // Mark the sides where the excerpt was cut.
        if window_start > 0 {
            rendered.insert_str(0, "...");
        }
        if window_end < content.len() {
            rendered.push_str("...");
        }
    } else {
        // Short content is shown in full.
        let mut cursor = 0;
        for &(start, end, _) in term_positions {
            if start >= content.len() {
                continue;
            }
            let safe_start = find_char_boundary(content, start);
            let safe_end = find_char_boundary(content, end.min(content.len()));
            let safe_cursor = find_char_boundary(content, cursor);

            // Plain text before the match.
            if safe_start > safe_cursor {
                rendered.push_str(&content[safe_cursor..safe_start]);
            }
            // The match itself.
            if safe_end > safe_start {
                push_marked(&mut rendered, &content[safe_start..safe_end]);
            }
            cursor = safe_end;
        }

        // Text after the last match.
        if cursor < content.len() {
            rendered.push_str(&content[find_char_boundary(content, cursor)..]);
        }
    }

    // Fallback: if nothing was rendered, return the leading part of the content.
    if rendered.is_empty() && !content.is_empty() {
        let safe_end = find_char_boundary(content, content.len().min(300));
        return format!("{}...", &content[0..safe_end]);
    }

    rendered
}
/// Moves `index` onto the nearest UTF-8 character boundary of `s`.
///
/// An index at or past the end maps to `s.len()` (0 for an empty string). An index that
/// is already a boundary is returned as is. Otherwise the closest boundary is returned,
/// preferring the earlier one when both are equally far away.
fn find_char_boundary(s: &str, index: usize) -> usize {
    // Also covers the empty string, whose length is 0.
    if index >= s.len() {
        return s.len();
    }
    if s.is_char_boundary(index) {
        return index;
    }

    // Offset 0 and `s.len()` are always boundaries, so both searches succeed.
    let before = (0..index)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0);
    let after = (index + 1..=s.len())
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(s.len());

    if index - before <= after - index {
        before
    } else {
        after
    }
}
/// 构建带匹配内容的标题树
fn build_heading_tree_with_matches(
article: &utils_common::models::ArticleMetadata,
terms: &[String],
search_index: &ArticleSearchIndex
) -> Option<HeadingNode> {
// 如果没有搜索词或内容为空返回None
if terms.is_empty() || article.content.is_empty() {
return None;
}
// 获取与文章相关的所有标题
let article_id_str = article.id.to_string();
let heading_map: HashMap<String, &HeadingIndexEntry> = search_index.heading_index.iter()
.filter(|(id, _)| id.starts_with(&format!("{}:", article_id_str)))
.map(|(id, entry)| (id.clone(), entry))
.collect();
if heading_map.is_empty() {
// 如果没有标题结构,创建一个根节点
let root_heading = HeadingIndexEntry {
id: format!("{}:root", article.id),
level: 0,
text: article.title.clone(),
start_position: 0,
end_position: article.content.len(),
parent_id: None,
children_ids: Vec::new(),
};
// 查找全文匹配
if let Some((highlighted_content, matched_terms)) = find_matches_in_paragraph(article, &root_heading, terms) {
return Some(HeadingNode {
id: root_heading.id,
text: root_heading.text,
level: root_heading.level,
content: Some(highlighted_content),
matched_terms: Some(matched_terms),
children: Vec::new(),
});
}
return None;
}
// 查找根标题(没有父标题的标题)
let mut root_headings: Vec<&&HeadingIndexEntry> = heading_map.values()
.filter(|entry| entry.parent_id.is_none())
.collect();
// 如果没有根标题返回None
if root_headings.is_empty() {
return None;
}
// 排序根标题,确保始终以相同的顺序处理
root_headings.sort_by_key(|entry| entry.start_position);
// 创建一个虚拟的根节点来包含所有顶级标题
let root_heading = HeadingIndexEntry {
id: format!("{}:root", article.id),
level: 0,
text: article.title.clone(),
start_position: 0,
end_position: article.content.len(),
parent_id: None,
children_ids: root_headings.iter().map(|entry| entry.id.clone()).collect(),
};
// 先查找每个段落中的匹配
let mut heading_matches: HashMap<String, (String, Vec<String>)> = HashMap::new();
// 处理所有标题下的匹配
for (heading_id, heading) in &heading_map {
if let Some((highlighted_content, matched_terms)) = find_matches_in_paragraph(article, heading, terms) {
heading_matches.insert(heading_id.clone(), (highlighted_content, matched_terms));
}
}
// 处理根节点下的直接内容(不属于任何标题的部分)
let root_content = if let Some((highlighted_content, matched_terms)) = find_matches_in_paragraph(article, &root_heading, terms) {
Some((highlighted_content, matched_terms))
} else {
None
};
// 创建根节点
let mut root_node = HeadingNode {
id: root_heading.id,
text: root_heading.text,
level: root_heading.level,
content: root_content.as_ref().map(|(content, _)| content.clone()),
matched_terms: root_content.as_ref().map(|(_, terms)| terms.clone()),
children: Vec::new(),
};
// 递归构建子标题树
for child_id in &root_heading.children_ids {
if let Some(heading) = heading_map.get(child_id) {
let mut child_node = HeadingNode {
id: child_id.clone(),
text: heading.text.clone(),
level: heading.level,
content: None,
matched_terms: None,
children: Vec::new(),
};
// 填充子节点的匹配内容和子节点
if let Some((content, terms)) = heading_matches.get(child_id) {
child_node.content = Some(content.clone());
child_node.matched_terms = Some(terms.clone());
}
// 递归处理子标题
if !heading.children_ids.is_empty() {
for grandchild_id in &heading.children_ids {
if let Some(grandchild) = heading_map.get(grandchild_id) {
let mut grandchild_node = HeadingNode {
id: grandchild_id.clone(),
text: grandchild.text.clone(),
level: grandchild.level,
content: None,
matched_terms: None,
children: Vec::new(),
};
// 填充孙节点的匹配内容
if let Some((content, terms)) = heading_matches.get(grandchild_id) {
grandchild_node.content = Some(content.clone());
grandchild_node.matched_terms = Some(terms.clone());
}
// 对于更深层次的节点,采用相同的处理方式
if !grandchild.children_ids.is_empty() {
process_deeper_nodes(&mut grandchild_node, grandchild, &heading_map, &heading_matches);
}
child_node.children.push(grandchild_node);
}
}
// 按标题文本排序子节点,保持一致性
child_node.children.sort_by(|a, b| a.text.cmp(&b.text));
}
root_node.children.push(child_node);
}
}
// 按级别和文本排序子节点
root_node.children.sort_by(|a, b| {
match a.level.cmp(&b.level) {
std::cmp::Ordering::Equal => a.text.cmp(&b.text),
other => other
}
});
Some(root_node)
}
/// Recursively attaches the children of `heading` to `parent`.
///
/// Each id in `heading.children_ids` that is present in `heading_map` becomes a node,
/// filled with its excerpt from `heading_matches` when there is one, and its own children
/// are attached the same way. Ids missing from `heading_map` are skipped. Afterwards
/// `parent.children` is sorted by level, then by heading text.
fn process_deeper_nodes(
    parent: &mut HeadingNode,
    heading: &HeadingIndexEntry,
    heading_map: &HashMap<String, &HeadingIndexEntry>,
    heading_matches: &HashMap<String, (String, Vec<String>)>
) {
    for child_id in &heading.children_ids {
        let child: &HeadingIndexEntry = match heading_map.get(child_id) {
            Some(entry) => *entry,
            None => continue,
        };

        // Excerpt and matched terms for this heading, if its section matched.
        let hit = heading_matches.get(child_id);
        let mut node = HeadingNode {
            id: child_id.clone(),
            text: child.text.clone(),
            level: child.level,
            content: hit.map(|(content, _)| content.clone()),
            matched_terms: hit.map(|(_, matched)| matched.clone()),
            children: Vec::new(),
        };

        // Descend into this heading's own children.
        if !child.children_ids.is_empty() {
            process_deeper_nodes(&mut node, child, heading_map, heading_matches);
        }

        parent.children.push(node);
    }

    // Order siblings by level, then by heading text.
    parent
        .children
        .sort_by(|a, b| a.level.cmp(&b.level).then_with(|| a.text.cmp(&b.text)));
}