Files
Claude-Code-Workflow/archive/compare_search.py
catlog22 874b70726d chore: archive unused test scripts and temporary documents
- Moved 6 empty test files (*.test.ts) to archive/
- Moved 5 Python test scripts (*.py) to archive/
- Moved 5 outdated/temporary documents to archive/
- Cleaned up root directory for better organization
2026-01-24 21:26:03 +08:00

120 lines
3.6 KiB
Python

#!/usr/bin/env python
"""Compare search results: Hybrid vs Cascade with Reranker."""
import subprocess
import sys
import os
import re
import json
# Switch to the target project directory. The search subprocess is started
# without an explicit cwd, so it inherits this working directory.
# NOTE(review): hard-coded Windows path; os.chdir raises if it does not exist.
os.chdir(r"D:\dongdiankaifa9\hydro_generator_module")
# Query string sent to every search method (Chinese: "thermal network calculation").
query = "热网络计算"
# Matches ANSI SGR escape sequences (ESC [ <digits/semicolons> m) so they can be
# stripped from captured CLI output before JSON parsing.
ansi_escape = re.compile(r'\x1b\[[0-9;]*m')
def run_search(method: str) -> dict:
    """Run ``codexlens search`` with the given method and return the parsed JSON.

    The module-level ``query`` is searched with ``--limit 10 --json``. The
    child's stdout and stderr are concatenated, ANSI colour codes are removed,
    and the first JSON object found in the text is decoded.

    Args:
        method: Value passed to the CLI's ``--method`` option.

    Returns:
        The decoded JSON object, or ``{"success": False, "error": <message>}``
        when the output contains no ``{`` or no candidate decodes.
    """
    cmd = [sys.executable, "-m", "codexlens", "search", query,
           "--method", method, "--limit", "10", "--json"]
    # errors="replace": a child that emits non-UTF-8 bytes must not abort the
    # capture; a few replacement characters are harmless for this comparison.
    result = subprocess.run(cmd, capture_output=True, text=True,
                            encoding="utf-8", errors="replace")
    # Strip ANSI codes (guard against a stream that came back as None).
    output = ansi_escape.sub('', (result.stdout or '') + (result.stderr or ''))

    decoder = json.JSONDecoder()
    first_error = None
    pos = output.find('{')
    while pos >= 0:
        # The first '{' is always tried. If it turns out to be noise (e.g. a
        # log line containing a brace), retry only from braces that start a
        # line, so a fragment nested inside a broken payload is never returned.
        if first_error is None or pos == 0 or output[pos - 1] == '\n':
            try:
                # raw_decode parses one JSON value starting at pos and ignores
                # whatever follows it, handling strings and escapes correctly.
                payload, _ = decoder.raw_decode(output, pos)
            except ValueError as e:
                if first_error is None:
                    first_error = str(e)
            else:
                return payload
        pos = output.find('{', pos + 1)

    if first_error is None:
        return {"success": False, "error": "No JSON found"}
    return {"success": False, "error": first_error}
def _show_results(result: dict) -> list:
    """Print the top ten hits of a search result and return their basenames.

    Prints the failure message and returns an empty list when the result
    is not marked successful.
    """
    if not result.get("success"):
        print("搜索失败:", result.get("error"))
        return []
    # The `or` fallbacks tolerate explicit JSON nulls as well as missing keys.
    hits = ((result.get("result") or {}).get("results") or [])[:10]
    names = []
    for rank, hit in enumerate(hits, 1):
        name = os.path.basename(hit.get("path") or "")
        score = hit.get("score") or 0
        names.append(name)
        print(f"{rank:2}. {name:<45} score={score:.4f}")
    return names


def _ranking_changes(before: list, after: list) -> list:
    """Describe how each name in ``after`` moved relative to ``before``.

    Positions are 1-based. Names absent from ``before`` are reported as NEW;
    names whose position is unchanged are omitted.
    """
    # Keep the first occurrence of a duplicated basename, like list.index().
    first_rank = {}
    for rank, name in enumerate(before, 1):
        first_rank.setdefault(name, rank)

    lines = []
    for new_pos, name in enumerate(after, 1):
        old_pos = first_rank.get(name)
        if old_pos is None:
            lines.append(f" {name}: NEW (不在 Hybrid 前10)")
        elif old_pos != new_pos:
            # A smaller position number means the file moved up the ranking.
            direction = "↑" if new_pos < old_pos else "↓"
            lines.append(f" {name}: #{old_pos} → #{new_pos} {direction}")
    return lines


print("=" * 75)
print("搜索对比: Hybrid vs Cascade")
print(f"查询: {query}")
print("=" * 75)

# Hybrid search (no cross-encoder reranking)
print("\n[1] Hybrid 搜索 (无 Cross-Encoder Reranker):")
print("-" * 75)
hybrid_result = run_search("hybrid")
hybrid_files = _show_results(hybrid_result)

# Cascade search (with cross-encoder reranking when strategy=hybrid)
print("\n[2] Cascade 搜索 (使用 Cross-Encoder Reranker):")
print("-" * 75)
cascade_result = run_search("cascade")
cascade_files = _show_results(cascade_result)

# Compare ranking changes
print("\n[3] 排名变化分析:")
print("-" * 75)
changes = []
if not (hybrid_result.get("success") and cascade_result.get("success")):
    # Comparing against a failed search would report bogus "NEW" entries or a
    # false "no change", so skip the comparison entirely.
    print("无法比较: 至少一次搜索失败")
else:
    changes = _ranking_changes(hybrid_files, cascade_files)
    if changes:
        print("Reranker 排序变化:")
        for line in changes:
            print(line)
    else:
        print("排序相同 (无变化)")

print("\n" + "=" * 75)
print("配置说明:")
print("- Hybrid: FTS + Vector 融合 (无二次精排)")
print("- Cascade: 粗筛 + Cross-Encoder Reranker 精排")
print("- Reranker: Qwen/Qwen3-Reranker-8B via SiliconFlow API")
print("=" * 75)