#!/usr/bin/env python3
"""Dedicated parser for the ``chatgpt.com-*.log`` format.

Each relevant line has the structure::

    hsw.js:2 {"tag":"索引点","tH":N,"Ig":"..."}

Meaning of the ``Ig`` value: the probed browser API constructor name /
property name / return value.
"""

import re
import json
import sys
import glob
from collections import defaultdict, OrderedDict

# Glob used when no log path is passed on the command line.
DEFAULT_LOG_GLOB = "/home/carry/myprj/hcaptcha/asset/chatgpt.com-*.log"

# Only JSON records carrying this "tag" value are collected.
INDEX_TAG = "索引点"

# Suffix stripped from the log path when deriving the JSON output path.
LOG_SUFFIX = ".log"

# Compiled once: matches the "hsw.js:<n> <payload>" console prefix.
_LINE_RE = re.compile(r'hsw\.js:\d+\s+(.*)')

# Compiled once: string values treated as an HSW token in the report.
_TOKEN_RE = re.compile(r'\d+:\d+:\d{4}')

# Known meanings of Ig values (based on common hCaptcha fingerprint logic).
KNOWN_APIS = {
    "Window": "全局 window 对象",
    "Promise": "Promise 构造函数检测",
    "Object": "Object 原型检测",
    "Performance": "performance API",
    "performance": "window.performance 属性",
    "Crypto": "window.crypto API",
    "Uint8Array": "TypedArray (crypto.getRandomValues)",
    "OfflineAudioContext": "AudioContext 指纹",
    "RTCPeerConnection": "WebRTC 检测",
    "fetch": "fetch API 检测",
    "Request": "fetch Request 构造函数",
    "Screen": "screen 对象",
    "Storage": "localStorage / sessionStorage",
    "IDBFactory": "indexedDB",
    "HTMLDocument": "document 类型",
    "HTMLCanvasElement": "Canvas 元素检测",
    "CanvasRenderingContext2D": "2D Canvas 渲染上下文",
    "Navigator": "navigator 对象",
    "webdriver": "navigator.webdriver 检测(bot检测关键)",
    "languages": "navigator.languages",
    "Array": "Array 类型检测",
    "getEntriesByType": "performance.getEntriesByType 方法",
    "prototype": "原型链检测",
    "constructor": "constructor 属性验证",
    "__wdata": "window 属性枚举(环境指纹)",
    "#000000": "Canvas fillStyle 默认值",
}


# ── Locate the log file ─────────────────────────────────────
def find_log(path_arg=None):
    """Return the path of the log file to analyse.

    Args:
        path_arg: Explicit path. When truthy it is returned unchanged; no
            existence check is performed here.

    Returns:
        ``path_arg`` if given, otherwise the last match of
        ``DEFAULT_LOG_GLOB`` in sorted (name) order.

    Side effects:
        Prints a hint and calls ``sys.exit(1)`` when no path is given and the
        glob matches nothing.
    """
    if path_arg:
        return path_arg
    candidates = sorted(glob.glob(DEFAULT_LOG_GLOB))
    if not candidates:
        print("❌ 未找到 chatgpt.com-*.log,请手动传入路径")
        sys.exit(1)
    # Last in name order; that is the newest capture only if the file names
    # embed a sortable timestamp.
    return candidates[-1]


# ── Parsing ─────────────────────────────────────────────────
def parse(path):
    """Extract index-point records from the log at ``path``.

    A line contributes an entry only when it matches ``hsw.js:<n> <payload>``,
    the payload is a JSON object whose ``tag`` equals ``INDEX_TAG``, and its
    ``tH`` is an integer. Anything else is skipped (best effort): unrelated
    lines, malformed JSON, records without ``tH`` and records whose ``tH`` is
    not an integer. The latter two used to crash the run, either here with a
    ``KeyError`` or later in ``report``, which formats ``tH`` with ``d``.

    Args:
        path: Path of a UTF-8 encoded log file.

    Returns:
        A list of dicts, in file order, with the keys ``lineno`` (1-based),
        ``tH``, ``has_ig`` (whether the record has an ``Ig`` field) and
        ``ig`` (the ``Ig`` value, or ``None`` when absent).
    """
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            m = _LINE_RE.match(line.strip())
            if not m:
                continue
            body = m.group(1).strip()
            if not body.startswith('{'):
                continue
            try:
                obj = json.loads(body)
            except json.JSONDecodeError:
                continue
            if obj.get("tag") != INDEX_TAG:
                continue
            tH = obj.get("tH")
            # bool is a subclass of int, so it has to be excluded explicitly.
            if not isinstance(tH, int) or isinstance(tH, bool):
                continue
            entries.append({
                "lineno": lineno,
                "tH": tH,
                "has_ig": "Ig" in obj,
                "ig": obj.get("Ig"),  # may be str/int/bool/None/...
            })
    return entries


# ── Aggregation ─────────────────────────────────────────────
def summarize(entries):
    """Group parsed entries by ``tH``.

    Args:
        entries: The list returned by ``parse``.

    Returns:
        A tuple ``(tH_igs, tH_no_ig, tH_lines)`` of ``defaultdict`` objects:

        * ``tH_igs``: ``tH`` -> distinct ``Ig`` values in first-seen order.
          A ``tH`` appears here only if at least one record had an ``Ig``.
        * ``tH_no_ig``: ``tH`` -> number of records without an ``Ig`` field.
        * ``tH_lines``: ``tH`` -> every line number on which it occurred.
    """
    tH_igs = defaultdict(list)
    tH_no_ig = defaultdict(int)
    tH_lines = defaultdict(list)
    seen = defaultdict(set)

    for e in entries:
        tH = e["tH"]
        tH_lines[tH].append(e["lineno"])
        if not e["has_ig"]:
            tH_no_ig[tH] += 1
            continue
        ig = e["ig"]
        # repr() makes unhashable values (lists, dicts) usable as a set
        # member and keeps True and 1 distinct.
        key = repr(ig)
        if key not in seen[tH]:
            seen[tH].add(key)
            tH_igs[tH].append(ig)

    return tH_igs, tH_no_ig, tH_lines


# ── Report printing ─────────────────────────────────────────
def _format_ig(v):
    """Render one Ig value for the table: long strings are truncated."""
    if isinstance(v, str):
        return v[:57] + "..." if len(v) > 60 else v
    return repr(v)


def _describe_ig(v):
    """Return the one-line description of an Ig value for the summary."""
    if isinstance(v, str):
        d = KNOWN_APIS.get(v)
        if d:
            return f"{v} → {d}"
        if v.startswith("0,1,2,3"):
            return "window keys 枚举列表 → 全局属性指纹"
        if _TOKEN_RE.match(v):
            return f"{v} → HSW token 格式"
        if v in ("f", "t", "c", "d"):
            return f'"{v}" → 分支标记字符'
        return v
    # bool must be tested before the numeric case (bool subclasses int).
    if isinstance(v, bool):
        return f"{v} → 布尔检测结果"
    if isinstance(v, (int, float)):
        return f"{v} → 数值"
    # None, lists, dicts: previously dropped without a trace.
    return repr(v)


def report(tH_igs, tH_no_ig, tH_lines):
    """Print the human-readable analysis to stdout.

    Three sections are printed: index points that have ``Ig`` values, index
    points that only ever appeared without ``Ig``, and a per-``tH``
    description of each ``Ig`` value.

    Args:
        tH_igs: ``tH`` -> list of distinct ``Ig`` values (see ``summarize``).
        tH_no_ig: ``tH`` -> count of records without ``Ig``.
        tH_lines: Accepted for interface compatibility; not used.
    """
    all_tH = sorted(set(tH_igs) | set(tH_no_ig))

    print("=" * 68)
    print(" HSW 新日志分析 — 每个索引点(tH)访问的浏览器 API")
    print("=" * 68)

    # Split into "has at least one Ig value" and "never had an Ig field".
    has_value = []
    only_no_ig = []
    for tH in all_tH:
        igs = tH_igs.get(tH, [])
        no = tH_no_ig.get(tH, 0)
        if igs:
            has_value.append((tH, igs, no))
        else:
            only_no_ig.append((tH, no))

    # ── tH with values ──
    print(f"\n✅ 有 Ig 值的索引点 ({len(has_value)} 个)\n")
    print(f" {'tH':<6} {'Ig 值(去重、按出现顺序)'}")
    print(f" {'─'*6} {'─'*56}")
    for tH, igs, no_cnt in has_value:
        ig_str = " | ".join(_format_ig(v) for v in igs)
        suffix = f" (另有 {no_cnt} 次无Ig)" if no_cnt else ""
        print(f" tH={tH:<4d} {ig_str}{suffix}")

    # ── tH that only appear without Ig ──
    print(f"\n🟠 仅无 Ig 字段的索引点 ({len(only_no_ig)} 个) ← void 路径或未命中\n")
    print(f" {'tH':<6} {'出现次数'}")
    print(f" {'─'*6} {'─'*10}")
    for tH, cnt in only_no_ig:
        print(f" tH={tH:<4d} {cnt} 次")

    # ── Summary by API category ──
    print(f"\n{'─'*68}")
    print(" 📋 API 检测归纳(每个 tH 在检测什么)")
    print(f"{'─'*68}\n")

    for tH, igs, _ in has_value:
        print(f" tH={tH:<4d}:")
        for v in igs:
            print(f" {_describe_ig(v)}")
        print()


# ── JSON export ─────────────────────────────────────────────
def export_json(tH_igs, tH_no_ig, out_path):
    """Write the per-``tH`` summary to ``out_path`` as UTF-8 JSON.

    Each key is ``str(tH)`` (ascending ``tH`` order) and maps to
    ``ig_values`` (strings longer than 200 characters are cut to 200 plus
    ``"..."``), ``no_ig_count`` and ``status`` (``"has_value"`` or
    ``"no_ig"``). A confirmation line is printed afterwards.

    Args:
        tH_igs: ``tH`` -> list of distinct ``Ig`` values.
        tH_no_ig: ``tH`` -> count of records without ``Ig``.
        out_path: Destination file; it is overwritten.
    """
    result = OrderedDict()
    for tH in sorted(set(tH_igs) | set(tH_no_ig)):
        igs = tH_igs.get(tH, [])
        result[str(tH)] = {
            "ig_values": [
                v[:200] + "..." if isinstance(v, str) and len(v) > 200 else v
                for v in igs
            ],
            "no_ig_count": tH_no_ig.get(tH, 0),
            "status": "has_value" if igs else "no_ig",
        }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"📄 JSON 已写入: {out_path}")


# ── Entry point ─────────────────────────────────────────────
def _analysis_path(log_path):
    """Derive the JSON output path from ``log_path``.

    Only a trailing ``.log`` is replaced. ``str.replace`` would also rewrite
    ``.log`` inside directory names, and for a path without ``.log`` it would
    return the input path itself, so the export would overwrite the log.
    """
    if log_path.endswith(LOG_SUFFIX):
        log_path = log_path[:-len(LOG_SUFFIX)]
    return log_path + "_analysis.json"


def main(argv=None):
    """Run parse -> summarize -> report -> export for one log file.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]``, when
            present, is the log path; otherwise ``find_log`` picks one.
    """
    argv = sys.argv if argv is None else argv
    log_path = find_log(argv[1] if len(argv) > 1 else None)
    print(f"📂 日志文件: {log_path}\n")

    entries = parse(log_path)
    print(f"共解析 {len(entries)} 条索引点记录\n")

    tH_igs, tH_no_ig, tH_lines = summarize(entries)
    report(tH_igs, tH_no_ig, tH_lines)

    export_json(tH_igs, tH_no_ig, _analysis_path(log_path))


if __name__ == "__main__":
    main()