Files
hcapEnv/analyze_new.py
2026-02-21 18:27:49 +08:00

214 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
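"""
Parser dedicated to logs in the chatgpt.com-*.log format.
Each line has the structure: hsw.js:2 {"tag":"索引点","tH":N,"Ig":"..."}
Meaning of the Ig value: the name of the browser API constructor / property being probed, or a returned value.
"""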
import re
import json
import sys
import glob
from collections import defaultdict, OrderedDict
# ── 自动找日志文件 ──────────────────────────────────────────
def find_log(path_arg=None, pattern="/home/carry/myprj/hcaptcha/asset/chatgpt.com-*.log"):
    """Return the log file to analyze.

    Args:
        path_arg: Explicit path supplied by the caller. When truthy it is
            returned unchanged, without checking that it exists.
        pattern: Glob pattern searched when no explicit path is given.

    Returns:
        ``path_arg`` if given, otherwise the most recently modified file
        matching ``pattern``.

    Exits the process with status 1 (after printing a message) when no
    explicit path is given and nothing matches ``pattern``.
    """
    if path_arg:
        return path_arg

    # Local import: the file-level import block does not provide ``os``.
    import os

    candidates = glob.glob(pattern)
    if not candidates:
        print("❌ 未找到 chatgpt.com-*.log请手动传入路径")
        sys.exit(1)

    # "Newest" is decided by modification time, not by name: a plain
    # lexicographic sort would rank "...-9.log" after "...-10.log".
    # The path itself is the tie-breaker so the choice stays deterministic.
    return max(candidates, key=lambda p: (os.path.getmtime(p), p))
# ── 解析 ────────────────────────────────────────────────────
def parse(path):
    """Extract the index-point records from an hsw.js console log.

    Only lines of the form ``hsw.js:<n> {json}`` whose JSON object has
    ``"tag" == "索引点"`` are considered. Everything else (other lines,
    malformed JSON, other tags) is skipped silently.

    Records without an integer ``tH`` are skipped as well: a missing key
    used to abort the whole run with ``KeyError``, and downstream reporting
    formats ``tH`` as an integer.

    Args:
        path: Path of the UTF-8 encoded log file.

    Returns:
        A list of dicts, in file order, with the keys ``lineno`` (1-based
        line number), ``tH``, ``has_ig`` (whether the ``Ig`` key was
        present) and ``ig`` (the ``Ig`` value, or ``None`` when absent).
    """
    # Compiled once instead of once per line.
    line_re = re.compile(r'hsw\.js:\d+\s+(.*)')
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            m = line_re.match(line.strip())
            if not m:
                continue
            body = m.group(1).strip()
            if not body.startswith('{'):
                continue
            try:
                obj = json.loads(body)
            except json.JSONDecodeError:
                # Best-effort parsing: truncated or non-JSON payloads are ignored.
                continue
            if obj.get("tag") != "索引点":
                continue
            tH = obj.get("tH")
            # bool is a subclass of int; it is not a valid index either.
            if not isinstance(tH, int) or isinstance(tH, bool):
                continue
            entries.append({
                "lineno": lineno,
                "tH": tH,
                "has_ig": "Ig" in obj,
                "ig": obj.get("Ig"),  # may be str / int / bool / None
            })
    return entries
# ── 汇总 ────────────────────────────────────────────────────
def summarize(entries):
    """Group parsed records by their ``tH`` index.

    For every ``tH`` the distinct ``Ig`` values are collected in order of
    first appearance. Records are split into two categories:
      - has_value: the record carries an ``Ig`` field
      - no_ig:     the record has no ``Ig`` field at all

    Returns a tuple of three defaultdicts:
      (tH -> de-duplicated Ig values,
       tH -> number of records without Ig,
       tH -> line numbers of every record for that tH)
    """
    igs_by_index = defaultdict(list)
    missing_by_index = defaultdict(int)
    lines_by_index = defaultdict(list)
    # repr() of each value already recorded per tH; repr keeps values such
    # as 1, True and "1" distinct and also works for unhashable values.
    fingerprints = defaultdict(set)

    for record in entries:
        index = record["tH"]
        lines_by_index[index].append(record["lineno"])

        if not record["has_ig"]:
            missing_by_index[index] += 1
            continue

        value = record["ig"]
        fingerprint = repr(value)
        already_seen = fingerprints[index]
        if fingerprint in already_seen:
            continue
        already_seen.add(fingerprint)
        igs_by_index[index].append(value)

    return igs_by_index, missing_by_index, lines_by_index
# ── 打印报告 ─────────────────────────────────────────────────
# Known Ig values mapped to what they indicate (based on common hCaptcha
# fingerprinting logic).
_KNOWN_IG_MEANINGS = {
    "Window": "全局 window 对象",
    "Promise": "Promise 构造函数检测",
    "Object": "Object 原型检测",
    "Performance": "performance API",
    "performance": "window.performance 属性",
    "Crypto": "window.crypto API",
    "Uint8Array": "TypedArray (crypto.getRandomValues)",
    "OfflineAudioContext": "AudioContext 指纹",
    "RTCPeerConnection": "WebRTC 检测",
    "fetch": "fetch API 检测",
    "Request": "fetch Request 构造函数",
    "Screen": "screen 对象",
    "Storage": "localStorage / sessionStorage",
    "IDBFactory": "indexedDB",
    "HTMLDocument": "document 类型",
    "HTMLCanvasElement": "Canvas 元素检测",
    "CanvasRenderingContext2D": "2D Canvas 渲染上下文",
    "Navigator": "navigator 对象",
    "webdriver": "navigator.webdriver 检测bot检测关键",
    "languages": "navigator.languages",
    "Array": "Array 类型检测",
    "getEntriesByType": "performance.getEntriesByType 方法",
    "prototype": "原型链检测",
    "constructor": "constructor 属性验证",
    "__wdata": "window 属性枚举(环境指纹)",
    "#000000": "Canvas fillStyle 默认值",
}

# Matches strings that begin with "<digits>:<digits>:<4 digits>".
_HSW_TOKEN_RE = re.compile(r'\d+:\d+:\d{4}')

# Single-character strings reported as branch markers.
_BRANCH_MARKERS = ("f", "t", "c", "d")


def _format_ig(value):
    """Render one Ig value for the overview table, truncating long strings."""
    if isinstance(value, str):
        return value[:57] + "..." if len(value) > 60 else value
    return repr(value)


def _describe_ig(value):
    """Return the one-line description of an Ig value for the API summary."""
    if isinstance(value, str):
        meaning = _KNOWN_IG_MEANINGS.get(value)
        if meaning:
            return f"{value} → {meaning}"
        if value.startswith("0,1,2,3"):
            return "window keys 枚举列表 → 全局属性指纹"
        if _HSW_TOKEN_RE.match(value):
            return f"{value} → HSW token 格式"
        if value in _BRANCH_MARKERS:
            return f'"{value}" → 分支标记字符'
        return value
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return f"{value} → 布尔检测结果"
    if isinstance(value, int):
        return f"{value} → 数值"
    # None, floats, lists, dicts: show them instead of silently dropping them.
    return repr(value)


def report(tH_igs, tH_no_ig, tH_lines):
    """Print a human-readable report of the Ig values seen per index point.

    Three sections are written to stdout:
      1. index points that have at least one Ig value, with those values;
      2. index points that only ever appeared without an Ig field;
      3. a per-index description of what each Ig value indicates.

    Args:
        tH_igs: Mapping tH -> list of distinct Ig values.
        tH_no_ig: Mapping tH -> number of records without an Ig field.
        tH_lines: Mapping tH -> line numbers; accepted for interface
            compatibility but not used by the report.
    """
    all_tH = sorted(set(tH_igs) | set(tH_no_ig))

    print("=" * 68)
    print(" HSW 新日志分析 — 每个索引点(tH)访问的浏览器 API")
    print("=" * 68)

    # Split index points into "has values" and "only seen without Ig".
    has_value = []
    only_no_ig = []
    for tH in all_tH:
        igs = tH_igs.get(tH, [])
        no = tH_no_ig.get(tH, 0)
        if igs:
            has_value.append((tH, igs, no))
        else:
            only_no_ig.append((tH, no))

    # Section 1: index points with Ig values.
    print(f"\n✅ 有 Ig 值的索引点 ({len(has_value)} 个)\n")
    print(f" {'tH':<6} {'Ig 值(去重、按出现顺序)'}")
    print(f" {'-'*6} {'-'*56}")
    for tH, igs, no_cnt in has_value:
        ig_str = " | ".join(_format_ig(v) for v in igs)
        suffix = f" (另有 {no_cnt} 次无Ig)" if no_cnt else ""
        print(f" tH={tH:<4d} {ig_str}{suffix}")

    # Section 2: index points that never carried an Ig field.
    print(f"\n🟠 仅无 Ig 字段的索引点 ({len(only_no_ig)} 个) ← void 路径或未命中\n")
    print(f" {'tH':<6} {'出现次数'}")
    print(f" {'-'*6} {'-'*10}")
    for tH, cnt in only_no_ig:
        print(f" tH={tH:<4d} {cnt}")

    # Section 3: what each index point appears to be probing.
    print(f"\n{'-'*68}")
    print(" 📋 API 检测归纳(每个 tH 在检测什么)")
    print(f"{'-'*68}\n")
    for tH, igs, _ in has_value:
        print(f" tH={tH:<4d}:")
        for v in igs:
            print(f" {_describe_ig(v)}")
        print()
# ── 导出 JSON ────────────────────────────────────────────────
def export_json(tH_igs, tH_no_ig, out_path):
    """Write the per-tH summary to ``out_path`` as pretty-printed JSON.

    The output object is keyed by ``str(tH)`` in ascending tH order. Each
    entry holds ``ig_values`` (string values longer than 200 characters are
    cut to 200 and suffixed with "..."), ``no_ig_count`` and ``status``
    ("has_value" when any Ig value exists, otherwise "no_ig").
    A confirmation line is printed once the file has been written.
    """
    def _clip(value):
        # Only strings are shortened; other JSON values pass through as-is.
        if isinstance(value, str) and len(value) > 200:
            return value[:200] + "..."
        return value

    payload = {}
    for index in sorted(set(tH_igs) | set(tH_no_ig)):
        values = tH_igs.get(index, [])
        payload[str(index)] = {
            "ig_values": list(map(_clip, values)),
            "no_ig_count": tH_no_ig.get(index, 0),
            "status": "has_value" if values else "no_ig",
        }

    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    print(f"📄 JSON 已写入: {out_path}")
# ── 入口 ─────────────────────────────────────────────────────
def analysis_output_path(log_path):
    """Return the JSON output path that corresponds to ``log_path``.

    Only a trailing ".log" is replaced by "_analysis.json". A blanket
    ``str.replace`` would also rewrite ".log" inside directory names, and
    for a path without ".log" it would return the input path itself, so the
    export would overwrite the log being analyzed.
    """
    suffix = ".log"
    stem = log_path[:-len(suffix)] if log_path.endswith(suffix) else log_path
    return stem + "_analysis.json"


def main(argv=None):
    """Run the analysis: locate the log, parse it, print the report, export JSON.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if any, is the log path.
    """
    args = sys.argv[1:] if argv is None else argv
    log_path = find_log(args[0] if args else None)
    print(f"📂 日志文件: {log_path}\n")
    entries = parse(log_path)
    print(f"共解析 {len(entries)} 条索引点记录\n")
    tH_igs, tH_no_ig, tH_lines = summarize(entries)
    report(tH_igs, tH_no_ig, tH_lines)
    export_json(tH_igs, tH_no_ig, analysis_output_path(log_path))


if __name__ == "__main__":
    main()