hcapEnv/analyze_new.py

#!/usr/bin/env python3
"""
专用解析脚本：chatgpt.com-*.log 格式
每行结构: hsw.js:2 {"tag":"索引点","tH":N,"Ig":"..."}
Ig 值含义：被检测的浏览器 API 构造函数名 / 属性名 / 返回值
"""

import re
import json
import sys
import glob
from collections import defaultdict, OrderedDict

# ── 自动找日志文件 ──────────────────────────────────────────
def find_log(path_arg=None):
    """Return the path of the log file to analyze.

    A truthy ``path_arg`` is returned unchanged.  Otherwise the hard-coded
    asset directory is globbed for ``chatgpt.com-*.log`` and the match whose
    name sorts last is returned.  When nothing matches, a hint is printed and
    the process exits with status 1.
    """
    if not path_arg:
        matches = glob.glob("/home/carry/myprj/hcaptcha/asset/chatgpt.com-*.log")
        if matches:
            # Greatest name in lexicographic order; treated as the newest log.
            return max(matches)
        print("❌ 未找到 chatgpt.com-*.log，请手动传入路径")
        sys.exit(1)
    return path_arg


# ── 解析 ────────────────────────────────────────────────────
def parse(path):
    """Extract index-point records from an hsw.js console log.

    Only lines of the form ``hsw.js:<n> <json-object>`` are considered, and
    only objects whose ``"tag"`` equals ``"索引点"`` and that carry a ``"tH"``
    key are kept.  Lines that do not match, bodies that are not valid JSON,
    and tagged records without ``"tH"`` are skipped silently.

    Args:
        path: Path of the UTF-8 encoded log file.

    Returns:
        A list of dicts in file order, each with the keys ``lineno``
        (1-based line number), ``tH``, ``has_ig`` (whether the record had an
        ``"Ig"`` key at all) and ``ig`` (the ``"Ig"`` value, or ``None`` when
        the key is absent).
    """
    # Compiled once instead of being looked up for every line.
    line_re = re.compile(r'hsw\.js:\d+\s+(.*)')
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f, 1):
            m = line_re.match(raw.strip())
            if not m:
                continue
            body = m.group(1).strip()
            if not body.startswith('{'):
                continue
            try:
                obj = json.loads(body)
            except json.JSONDecodeError:
                # Truncated or non-JSON console output: ignore the line.
                continue
            if obj.get("tag") != "索引点":
                continue
            if "tH" not in obj:
                # A tagged record without an index cannot be attributed to
                # any index point; skip it rather than abort the whole parse
                # with a KeyError.
                continue
            entries.append({
                "lineno": lineno,
                "tH":     obj["tH"],
                "has_ig": "Ig" in obj,
                "ig":     obj.get("Ig"),   # may be str/int/bool/None
            })
    return entries


# ── 汇总 ────────────────────────────────────────────────────
def summarize(entries):
    """Group parsed records by their ``tH`` index.

    For every ``tH`` the distinct Ig values are collected in first-seen
    order; two values count as the same when their ``repr`` is equal, so
    ``1`` and ``True`` stay separate and unhashable values are handled.

    Returns a 3-tuple of ``defaultdict`` objects:
      - tH -> list of distinct Ig values (only for records that had Ig)
      - tH -> number of records that had no Ig field at all
      - tH -> line numbers of every record seen for that tH
    """
    igs_by_index = defaultdict(list)
    missing_by_index = defaultdict(int)
    lines_by_index = defaultdict(list)

    # repr() fingerprints already recorded for each index.
    fingerprints = defaultdict(set)

    for record in entries:
        index = record["tH"]
        lines_by_index[index].append(record["lineno"])

        if not record["has_ig"]:
            missing_by_index[index] += 1
            continue

        value = record["ig"]
        fingerprint = repr(value)
        if fingerprint in fingerprints[index]:
            continue
        fingerprints[index].add(fingerprint)
        igs_by_index[index].append(value)

    return igs_by_index, missing_by_index, lines_by_index


# ── 打印报告 ─────────────────────────────────────────────────
# Meanings of well-known Ig values (based on common hCaptcha fingerprint logic).
_KNOWN_IG_MEANINGS = {
    "Window":                   "全局 window 对象",
    "Promise":                  "Promise 构造函数检测",
    "Object":                   "Object 原型检测",
    "Performance":              "performance API",
    "performance":              "window.performance 属性",
    "Crypto":                   "window.crypto API",
    "Uint8Array":               "TypedArray (crypto.getRandomValues)",
    "OfflineAudioContext":      "AudioContext 指纹",
    "RTCPeerConnection":        "WebRTC 检测",
    "fetch":                    "fetch API 检测",
    "Request":                  "fetch Request 构造函数",
    "Screen":                   "screen 对象",
    "Storage":                  "localStorage / sessionStorage",
    "IDBFactory":               "indexedDB",
    "HTMLDocument":             "document 类型",
    "HTMLCanvasElement":        "Canvas 元素检测",
    "CanvasRenderingContext2D": "2D Canvas 渲染上下文",
    "Navigator":                "navigator 对象",
    "webdriver":                "navigator.webdriver 检测（bot检测关键）",
    "languages":                "navigator.languages",
    "Array":                    "Array 类型检测",
    "getEntriesByType":         "performance.getEntriesByType 方法",
    "prototype":                "原型链检测",
    "constructor":              "constructor 属性验证",
    "__wdata":                  "window 属性枚举（环境指纹）",
    "#000000":                  "Canvas fillStyle 默认值",
}


def _format_ig(value):
    """Render one Ig value for the summary table; strings over 60 chars are clipped."""
    if isinstance(value, str):
        return value[:57] + "..." if len(value) > 60 else value
    return repr(value)


def _describe_ig(value):
    """Return the annotation line shown for one Ig value in the category section."""
    if isinstance(value, str):
        meaning = _KNOWN_IG_MEANINGS.get(value)
        if meaning:
            return f"{value} → {meaning}"
        if value.startswith("0,1,2,3"):
            return "window keys 枚举列表 → 全局属性指纹"
        if re.match(r'\d+:\d+:\d{4}', value):
            return f"{value} → HSW token 格式"
        if value in ("f", "t", "c", "d"):
            return f'"{value}" → 分支标记字符'
        return value
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return f"{value} → 布尔检测结果"
    if isinstance(value, int):
        return f"{value} → 数值"
    # None, floats, lists, dicts: show them verbatim instead of dropping them,
    # so this section lists the same values as the summary table.
    return repr(value)


def report(tH_igs, tH_no_ig, tH_lines):
    """Print the human-readable analysis to stdout.

    Three sections are printed: index points that produced at least one Ig
    value, index points that only ever appeared without an Ig field, and a
    per-index annotation of every Ig value.

    Args:
        tH_igs: Mapping tH -> list of distinct Ig values.
        tH_no_ig: Mapping tH -> count of records without an Ig field.
        tH_lines: Accepted for interface compatibility; not used here.
    """
    all_tH = sorted(set(tH_igs) | set(tH_no_ig))

    print("=" * 68)
    print("  HSW 新日志分析  —  每个索引点(tH)访问的浏览器 API")
    print("=" * 68)

    # Split the indices into the two report groups.
    has_value = []
    only_no_ig = []
    for tH in all_tH:
        # .get() keeps defaultdict inputs from growing new keys.
        igs = tH_igs.get(tH, [])
        no_cnt = tH_no_ig.get(tH, 0)
        if igs:
            has_value.append((tH, igs, no_cnt))
        else:
            only_no_ig.append((tH, no_cnt))

    # Section 1: indices with Ig values.
    print(f"\n✅  有 Ig 值的索引点 ({len(has_value)} 个)\n")
    print(f"  {'tH':<6}  {'Ig 值（去重、按出现顺序）'}")
    print(f"  {'─'*6}  {'─'*56}")
    for tH, igs, no_cnt in has_value:
        ig_str = "  |  ".join(_format_ig(v) for v in igs)
        suffix = f"  (另有 {no_cnt} 次无Ig)" if no_cnt else ""
        print(f"  tH={tH:<4d}  {ig_str}{suffix}")

    # Section 2: indices that never carried an Ig field.
    print(f"\n🟠  仅无 Ig 字段的索引点 ({len(only_no_ig)} 个)  ← void 路径或未命中\n")
    print(f"  {'tH':<6}  {'出现次数'}")
    print(f"  {'─'*6}  {'─'*10}")
    for tH, cnt in only_no_ig:
        print(f"  tH={tH:<4d}  {cnt} 次")

    # Section 3: per-index annotation of each Ig value.
    print(f"\n{'─'*68}")
    print("  📋  API 检测归纳（每个 tH 在检测什么）")
    print(f"{'─'*68}\n")

    for tH, igs, _ in has_value:
        print(f"  tH={tH:<4d}:")
        for v in igs:
            print(f"           {_describe_ig(v)}")
        print()


# ── 导出 JSON ────────────────────────────────────────────────
def export_json(tH_igs, tH_no_ig, out_path):
    """Write the per-index summary to ``out_path`` as UTF-8 JSON.

    The top-level object is keyed by ``str(tH)`` in ascending tH order.  Each
    value holds ``ig_values`` (strings longer than 200 characters are cut to
    200 and suffixed with ``"..."``), ``no_ig_count`` and ``status``
    (``"has_value"`` when at least one Ig value exists, else ``"no_ig"``).
    A confirmation line is printed after the file is written.
    """
    def clip(value):
        # Only over-long strings are shortened; other values pass through.
        if isinstance(value, str) and len(value) > 200:
            return value[:200] + "..."
        return value

    payload = OrderedDict()
    for index in sorted(set(tH_igs) | set(tH_no_ig)):
        values = tH_igs.get(index, [])
        payload[str(index)] = {
            "ig_values": [clip(v) for v in values],
            "no_ig_count": tH_no_ig.get(index, 0),
            "status": "has_value" if values else "no_ig",
        }

    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    print(f"📄 JSON 已写入: {out_path}")


# ── 入口 ─────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: locate the log (optional first CLI argument),
    # parse it, print the report, then write the JSON summary beside the log.
    log_path = find_log(sys.argv[1] if len(sys.argv) > 1 else None)
    print(f"📂 日志文件: {log_path}\n")

    entries = parse(log_path)
    print(f"共解析 {len(entries)} 条索引点记录\n")

    tH_igs, tH_no_ig, tH_lines = summarize(entries)
    report(tH_igs, tH_no_ig, tH_lines)

    # Strip only a trailing ".log".  str.replace would rewrite every ".log"
    # in the path (directory names included) and, for a path without ".log",
    # hand back the input path itself, so the export would overwrite the log.
    stem = log_path[:-len(".log")] if log_path.endswith(".log") else log_path
    out = stem + "_analysis.json"
    export_json(tH_igs, tH_no_ig, out)