415gotit

2026-02-21 18:27:49 +08:00
parent 0ac4b23f07
commit 5dc86ccfbf
270 changed files with 49508 additions and 4636 deletions
--- a/analyze_new.py
+++ b/analyze_new.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+专用解析脚本：chatgpt.com-*.log 格式
+每行结构: hsw.js:2 {"tag":"索引点","tH":N,"Ig":"..."}
+Ig 值含义：被检测的浏览器 API 构造函数名 / 属性名 / 返回值
+"""
+
+import re
+import json
+import sys
+import glob
+from collections import defaultdict, OrderedDict
+
+# ── 自动找日志文件 ──────────────────────────────────────────
# Default search location, kept identical to the original hard-coded value so
# existing invocations behave the same.
DEFAULT_LOG_GLOB = "/home/carry/myprj/hcaptcha/asset/chatgpt.com-*.log"


def find_log(path_arg=None, pattern=DEFAULT_LOG_GLOB):
    """Resolve the log file to analyse.

    Args:
        path_arg: Explicit path supplied by the caller. Any truthy value is
            returned unchanged; the file's existence is not checked here.
        pattern: Glob pattern searched when ``path_arg`` is empty. Defaults
            to ``DEFAULT_LOG_GLOB``.

    Returns:
        ``path_arg`` when given, otherwise the last match of ``pattern`` in
        lexicographic order.

    Raises:
        SystemExit: With status 1 when ``path_arg`` is empty and nothing
            matches ``pattern``.
    """
    if path_arg:
        return path_arg

    candidates = sorted(glob.glob(pattern))
    if not candidates:
        # Diagnostics go to stderr so they never mix with the report output.
        print("❌ 未找到 chatgpt.com-*.log，请手动传入路径", file=sys.stderr)
        sys.exit(1)

    # Selection is by file *name*, not modification time; this is only "the
    # newest" if the file names sort chronologically.
    return candidates[-1]
+
+
+# ── 解析 ────────────────────────────────────────────────────
# A relevant line looks like:  hsw.js:<digits> <payload>
_HSW_LINE_RE = re.compile(r'hsw\.js:\d+\s+(.*)')


def parse(path):
    """Extract the tagged index-point records from a log file.

    A line is considered only if, after stripping, it starts with
    ``hsw.js:<digits>`` followed by whitespace and a JSON object whose
    ``"tag"`` equals ``"索引点"``. Lines that do not match, payloads that are
    not valid JSON, and records without an integer ``"tH"`` are skipped.

    Args:
        path: Path of the UTF-8 encoded log file.

    Returns:
        A list of dicts in file order, each with the keys ``"lineno"``
        (1-based line number), ``"tH"`` (int), ``"has_ig"`` (whether the
        ``"Ig"`` key is present) and ``"ig"`` (the ``"Ig"`` value, or ``None``
        when absent).
    """
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            m = _HSW_LINE_RE.match(line.strip())
            if not m:
                continue

            body = m.group(1).strip()
            if not body.startswith('{'):
                continue

            # Only the decode itself is guarded, so unrelated errors are not
            # swallowed along with malformed payloads.
            try:
                obj = json.loads(body)
            except json.JSONDecodeError:
                continue

            if not isinstance(obj, dict) or obj.get("tag") != "索引点":
                continue

            # A record without a usable integer tH cannot be grouped or
            # formatted downstream; skip it instead of raising KeyError.
            # bool is excluded explicitly because it is a subclass of int.
            tH = obj.get("tH")
            if isinstance(tH, bool) or not isinstance(tH, int):
                continue

            entries.append({
                "lineno": lineno,
                "tH":     tH,
                "has_ig": "Ig" in obj,
                "ig":     obj.get("Ig"),   # may be any JSON value, or None
            })
    return entries
+
+
+# ── 汇总 ────────────────────────────────────────────────────
def summarize(entries):
    """Group parsed records by their ``tH`` index.

    Args:
        entries: Iterable of dicts with the keys ``"tH"``, ``"lineno"``,
            ``"has_ig"`` and ``"ig"``.

    Returns:
        A 3-tuple of ``defaultdict`` objects keyed by ``tH``:

        * distinct ``Ig`` values in first-seen order (two values count as
          the same when their ``repr()`` is equal),
        * the number of records that carried no ``Ig`` field,
        * the line numbers of every record for that ``tH``.
    """
    ig_by_index = defaultdict(list)
    missing_by_index = defaultdict(int)
    lines_by_index = defaultdict(list)
    fingerprints = defaultdict(set)   # per-tH repr() strings already recorded

    for record in entries:
        index = record["tH"]
        lines_by_index[index].append(record["lineno"])

        if not record["has_ig"]:
            missing_by_index[index] += 1
            continue

        value = record["ig"]
        # repr() gives a hashable key even for unhashable values and keeps
        # 1 and True apart.
        marker = repr(value)
        recorded = fingerprints[index]
        if marker in recorded:
            continue
        recorded.add(marker)
        ig_by_index[index].append(value)

    return ig_by_index, missing_by_index, lines_by_index
+
+
+# ── 打印报告 ─────────────────────────────────────────────────
# Meanings attached to specific Ig values in the per-tH summary section.
_KNOWN_IG_MEANINGS = {
    "Window":                   "全局 window 对象",
    "Promise":                  "Promise 构造函数检测",
    "Object":                   "Object 原型检测",
    "Performance":              "performance API",
    "performance":              "window.performance 属性",
    "Crypto":                   "window.crypto API",
    "Uint8Array":               "TypedArray (crypto.getRandomValues)",
    "OfflineAudioContext":      "AudioContext 指纹",
    "RTCPeerConnection":        "WebRTC 检测",
    "fetch":                    "fetch API 检测",
    "Request":                  "fetch Request 构造函数",
    "Screen":                   "screen 对象",
    "Storage":                  "localStorage / sessionStorage",
    "IDBFactory":               "indexedDB",
    "HTMLDocument":             "document 类型",
    "HTMLCanvasElement":        "Canvas 元素检测",
    "CanvasRenderingContext2D": "2D Canvas 渲染上下文",
    "Navigator":                "navigator 对象",
    "webdriver":                "navigator.webdriver 检测（bot检测关键）",
    "languages":                "navigator.languages",
    "Array":                    "Array 类型检测",
    "getEntriesByType":         "performance.getEntriesByType 方法",
    "prototype":                "原型链检测",
    "constructor":              "constructor 属性验证",
    "__wdata":                  "window 属性枚举（环境指纹）",
    "#000000":                  "Canvas fillStyle 默认值",
}


def _format_ig(value):
    """Render one Ig value for the table: long strings cut to 57 chars + '...'."""
    if isinstance(value, str):
        return value[:57] + "..." if len(value) > 60 else value
    return repr(value)


def _describe_ig(value):
    """Return the summary line for one Ig value; never drops a value."""
    if isinstance(value, str):
        meaning = _KNOWN_IG_MEANINGS.get(value)
        if meaning:
            return f"{value} → {meaning}"
        if value.startswith("0,1,2,3"):
            return "window keys 枚举列表 → 全局属性指纹"
        if re.match(r'\d+:\d+:\d{4}', value):
            return f"{value} → HSW token 格式"
        if value in ("f", "t", "c", "d"):
            return f'"{value}" → 分支标记字符'
        return value
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return f"{value} → 布尔检测结果"
    if isinstance(value, int):
        return f"{value} → 数值"
    # None, floats, lists, dicts, ...: previously omitted from the summary
    # altogether; show them verbatim instead.
    return repr(value)


def report(tH_igs, tH_no_ig, tH_lines):
    """Print the human-readable analysis to stdout.

    Three sections are printed: index points that have at least one Ig value,
    index points that only ever appeared without an Ig field, and a per-index
    summary line for every Ig value.

    Args:
        tH_igs: Mapping ``tH -> list`` of distinct Ig values.
        tH_no_ig: Mapping ``tH -> int`` count of records without an Ig field.
        tH_lines: Accepted for interface compatibility; not used here.

    Returns:
        None.
    """
    all_tH = sorted(set(tH_igs) | set(tH_no_ig))

    print("=" * 68)
    print("  HSW 新日志分析  —  每个索引点(tH)访问的浏览器 API")
    print("=" * 68)

    # Split the indices into those with values and those without any.
    has_value = []
    only_no_ig = []
    for tH in all_tH:
        igs = tH_igs.get(tH, [])
        no = tH_no_ig.get(tH, 0)
        if igs:
            has_value.append((tH, igs, no))
        else:
            only_no_ig.append((tH, no))

    # Section 1: indices with Ig values.
    print(f"\n✅  有 Ig 值的索引点 ({len(has_value)} 个)\n")
    print(f"  {'tH':<6}  {'Ig 值（去重、按出现顺序）'}")
    print(f"  {'─'*6}  {'─'*56}")
    for tH, igs, no_cnt in has_value:
        ig_str = "  |  ".join(_format_ig(v) for v in igs)
        suffix = f"  (另有 {no_cnt} 次无Ig)" if no_cnt else ""
        print(f"  tH={tH:<4d}  {ig_str}{suffix}")

    # Section 2: indices that never carried an Ig field.
    print(f"\n🟠  仅无 Ig 字段的索引点 ({len(only_no_ig)} 个)  ← void 路径或未命中\n")
    print(f"  {'tH':<6}  {'出现次数'}")
    print(f"  {'─'*6}  {'─'*10}")
    for tH, cnt in only_no_ig:
        print(f"  tH={tH:<4d}  {cnt} 次")

    # Section 3: per-index summary of each Ig value.
    print(f"\n{'─'*68}")
    print("  📋  API 检测归纳（每个 tH 在检测什么）")
    print(f"{'─'*68}\n")

    for tH, igs, _ in has_value:
        print(f"  tH={tH:<4d}:")
        for v in igs:
            d = _describe_ig(v)
            print(f"           {d}")
        print()
+
+
+# ── 导出 JSON ────────────────────────────────────────────────
def export_json(tH_igs, tH_no_ig, out_path):
    """Write the per-index summary to ``out_path`` as UTF-8 JSON.

    The top-level object is keyed by ``str(tH)`` in ascending ``tH`` order.
    Each entry holds ``"ig_values"`` (string values longer than 200
    characters are cut to 200 plus ``"..."``), ``"no_ig_count"`` and
    ``"status"`` (``"has_value"`` or ``"no_ig"``). A confirmation line is
    printed once the file has been written.

    Args:
        tH_igs: Mapping ``tH -> list`` of distinct Ig values.
        tH_no_ig: Mapping ``tH -> int`` count of records without an Ig field.
        out_path: Destination file path; an existing file is overwritten.
    """
    def clip(value):
        # Only over-long strings are shortened; everything else passes through.
        if isinstance(value, str) and len(value) > 200:
            return value[:200] + "..."
        return value

    payload = OrderedDict()
    for index in sorted(set(tH_igs) | set(tH_no_ig)):
        values = tH_igs.get(index, [])
        payload[str(index)] = {
            "ig_values": [clip(item) for item in values],
            "no_ig_count": tH_no_ig.get(index, 0),
            "status": "has_value" if values else "no_ig",
        }

    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    print(f"📄 JSON 已写入: {out_path}")
+
+
+# ── 入口 ─────────────────────────────────────────────────────
def analysis_output_path(log_path):
    """Derive the JSON report path that belongs to ``log_path``.

    Only a trailing ``.log`` suffix is replaced. ``str.replace`` would rewrite
    every ``.log`` occurrence (including directory names) and, for an input
    without ``.log``, would return the input path itself, so the export would
    overwrite the log being analysed.

    Args:
        log_path: Path of the analysed log file.

    Returns:
        ``log_path`` without a trailing ``.log`` (if present), followed by
        ``_analysis.json``.
    """
    suffix = ".log"
    stem = log_path[:-len(suffix)] if log_path.endswith(suffix) else log_path
    return stem + "_analysis.json"


def main(argv=None):
    """Run the full pipeline: locate the log, parse, report, export JSON.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the log
            path.
    """
    args = sys.argv[1:] if argv is None else argv
    log_path = find_log(args[0] if args else None)
    print(f"📂 日志文件: {log_path}\n")

    entries = parse(log_path)
    print(f"共解析 {len(entries)} 条索引点记录\n")

    tH_igs, tH_no_ig, tH_lines = summarize(entries)
    report(tH_igs, tH_no_ig, tH_lines)

    export_json(tH_igs, tH_no_ig, analysis_output_path(log_path))


if __name__ == "__main__":
    main()