This commit is contained in:
dela
2026-02-21 18:27:49 +08:00
parent 0ac4b23f07
commit 5dc86ccfbf
270 changed files with 49508 additions and 4636 deletions

213
analyze_new.py Normal file
View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
专用解析脚本chatgpt.com-*.log 格式
每行结构: hsw.js:2 {"tag":"索引点","tH":N,"Ig":"..."}
Ig 值含义:被检测的浏览器 API 构造函数名 / 属性名 / 返回值
"""
import re
import json
import sys
import glob
from collections import defaultdict, OrderedDict
# ── 自动找日志文件 ──────────────────────────────────────────
def find_log(
    path_arg=None,
    pattern="/home/carry/myprj/hcaptcha/asset/chatgpt.com-*.log",
):
    """Return the path of the log file to analyse.

    Args:
        path_arg: Explicit log path. When truthy it is returned unchanged,
            without checking that the file exists.
        pattern: Glob pattern searched when ``path_arg`` is not supplied.
            Defaults to the location that used to be hard-coded, so existing
            callers behave exactly as before.

    Returns:
        ``path_arg`` if supplied, otherwise the match of ``pattern`` that
        sorts last by name.

    Raises:
        SystemExit: With status 1 when nothing matches ``pattern``.
    """
    if path_arg:
        return path_arg
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        print("❌ 未找到 chatgpt.com-*.log请手动传入路径")
        sys.exit(1)
    # "Latest" means last in lexicographic order of the path, not newest by
    # modification time.
    return candidates[-1]
# ── 解析 ────────────────────────────────────────────────────
def parse(path):
    """Extract the index-point records from an hsw.js console log.

    A line is considered only if, after stripping, it starts with
    ``hsw.js:<digits>`` followed by whitespace and a payload beginning with
    ``{``. The payload must decode as JSON, carry ``"tag": "索引点"`` and
    contain a ``tH`` key; anything else is skipped silently.

    Args:
        path: Path of the log file, read as UTF-8.

    Returns:
        A list of dicts in file order with the keys ``lineno`` (1-based line
        number), ``tH``, ``has_ig`` (whether the ``Ig`` key was present) and
        ``ig`` (the ``Ig`` value, or ``None`` when absent).
    """
    line_re = re.compile(r'hsw\.js:\d+\s+(.*)')
    entries = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            m = line_re.match(line.strip())
            if not m:
                continue
            body = m.group(1).strip()
            if not body.startswith('{'):
                continue
            try:
                obj = json.loads(body)
            except json.JSONDecodeError:
                # Truncated or otherwise malformed payloads are expected in
                # console dumps; ignore them.
                continue
            # A tagged record without "tH" cannot be attributed to an index
            # point. Skip it rather than abort the whole run with KeyError.
            if obj.get("tag") != "索引点" or "tH" not in obj:
                continue
            entries.append({
                "lineno": lineno,
                "tH": obj["tH"],
                "has_ig": "Ig" in obj,
                "ig": obj.get("Ig"),  # may be str/int/bool/None
            })
    return entries
# ── 汇总 ────────────────────────────────────────────────────
def summarize(entries):
    """Group parsed records by their ``tH`` index.

    Args:
        entries: Iterable of dicts with the keys ``tH``, ``lineno``,
            ``has_ig`` and ``ig``.

    Returns:
        A 3-tuple of ``defaultdict`` objects keyed by ``tH``:

        * distinct ``Ig`` values in first-seen order (values are compared by
          their ``repr``, so ``1`` and ``True`` stay separate);
        * number of records that had no ``Ig`` field;
        * line numbers of every record for that ``tH``.
    """
    tH_igs = defaultdict(list)
    tH_no_ig = defaultdict(int)
    tH_lines = defaultdict(list)
    fingerprints = defaultdict(set)  # tH -> repr() of Ig values already kept

    for record in entries:
        index = record["tH"]
        tH_lines[index].append(record["lineno"])

        if not record["has_ig"]:
            tH_no_ig[index] += 1
            continue

        value = record["ig"]
        marker = repr(value)
        already_kept = fingerprints[index]
        if marker in already_kept:
            continue
        already_kept.add(marker)
        tH_igs[index].append(value)

    return tH_igs, tH_no_ig, tH_lines
# ── 打印报告 ─────────────────────────────────────────────────
# Character repeated to draw table rules. The original repeated an empty
# string, which rendered every rule as a blank line.
_RULE_CHAR = "-"

# Human-readable meaning of well-known Ig values.
_KNOWN_IG = {
    "Window": "全局 window 对象",
    "Promise": "Promise 构造函数检测",
    "Object": "Object 原型检测",
    "Performance": "performance API",
    "performance": "window.performance 属性",
    "Crypto": "window.crypto API",
    "Uint8Array": "TypedArray (crypto.getRandomValues)",
    "OfflineAudioContext": "AudioContext 指纹",
    "RTCPeerConnection": "WebRTC 检测",
    "fetch": "fetch API 检测",
    "Request": "fetch Request 构造函数",
    "Screen": "screen 对象",
    "Storage": "localStorage / sessionStorage",
    "IDBFactory": "indexedDB",
    "HTMLDocument": "document 类型",
    "HTMLCanvasElement": "Canvas 元素检测",
    "CanvasRenderingContext2D": "2D Canvas 渲染上下文",
    "Navigator": "navigator 对象",
    "webdriver": "navigator.webdriver 检测bot检测关键",
    "languages": "navigator.languages",
    "Array": "Array 类型检测",
    "getEntriesByType": "performance.getEntriesByType 方法",
    "prototype": "原型链检测",
    "constructor": "constructor 属性验证",
    "__wdata": "window 属性枚举(环境指纹)",
    "#000000": "Canvas fillStyle 默认值",
}


def _format_ig(value):
    """Render one Ig value for the summary table, clipping long strings."""
    if not isinstance(value, str):
        return repr(value)
    if len(value) > 60:
        return value[:57] + "..."
    return value


def _describe_ig(value):
    """Return the one-line description of an Ig value for the API section."""
    if isinstance(value, str):
        meaning = _KNOWN_IG.get(value)
        if meaning:
            return f"{value} → {meaning}"
        if value.startswith("0,1,2,3"):
            return "window keys 枚举列表 → 全局属性指纹"
        if re.match(r'\d+:\d+:\d{4}', value):
            return f"{value} → HSW token 格式"
        if value in ("f", "t", "c", "d"):
            return f'"{value}" → 分支标记字符'
        return value
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return f"{value} → 布尔检测结果"
    if isinstance(value, (int, float)):
        return f"{value} → 数值"
    # None, lists, dicts, ...: show them rather than silently dropping them.
    return repr(value)


def report(tH_igs, tH_no_ig, tH_lines):
    """Print the analysis report to stdout.

    The report has three sections: index points that produced at least one
    Ig value, index points that only ever appeared without an Ig field, and
    a per-index description of what each Ig value stands for.

    Args:
        tH_igs: Mapping ``tH -> list`` of distinct Ig values.
        tH_no_ig: Mapping ``tH -> int`` count of records without an Ig field.
        tH_lines: Accepted for call compatibility; not used by the report.

    Returns:
        None.
    """
    rule = _RULE_CHAR
    all_tH = sorted(set(tH_igs) | set(tH_no_ig))

    print("=" * 68)
    print(" HSW 新日志分析 — 每个索引点(tH)访问的浏览器 API")
    print("=" * 68)

    # Split the index points into the two report groups.
    has_value = []
    only_no_ig = []
    for tH in all_tH:
        igs = tH_igs.get(tH, [])
        no = tH_no_ig.get(tH, 0)
        if igs:
            has_value.append((tH, igs, no))
        else:
            only_no_ig.append((tH, no))

    # Section 1: index points with Ig values.
    print(f"\n✅ 有 Ig 值的索引点 ({len(has_value)} 个)\n")
    print(f" {'tH':<6} {'Ig 值(去重、按出现顺序)'}")
    print(f" {rule * 6} {rule * 56}")
    for tH, igs, no_cnt in has_value:
        ig_str = " | ".join(_format_ig(v) for v in igs)
        suffix = f" (另有 {no_cnt} 次无Ig)" if no_cnt else ""
        print(f" tH={tH:<4d} {ig_str}{suffix}")

    # Section 2: index points that never carried an Ig field.
    print(f"\n🟠 仅无 Ig 字段的索引点 ({len(only_no_ig)} 个) ← void 路径或未命中\n")
    print(f" {'tH':<6} {'出现次数'}")
    print(f" {rule * 6} {rule * 10}")
    for tH, cnt in only_no_ig:
        print(f" tH={tH:<4d} {cnt}")

    # Section 3: what each index point is probing.
    print(f"\n{rule * 68}")
    print(" 📋 API 检测归纳(每个 tH 在检测什么)")
    print(f"{rule * 68}\n")
    for tH, igs, _ in has_value:
        print(f" tH={tH:<4d}:")
        for v in igs:
            print(f" {_describe_ig(v)}")
        print()
# ── 导出 JSON ────────────────────────────────────────────────
def export_json(tH_igs, tH_no_ig, out_path):
    """Write the per-``tH`` summary to ``out_path`` as indented UTF-8 JSON.

    Each ``tH`` becomes a string key (in sorted ``tH`` order) mapping to an
    object with ``ig_values`` (strings longer than 200 characters are cut to
    200 and suffixed with ``...``), ``no_ig_count`` and ``status``
    (``"has_value"`` when any Ig value exists, else ``"no_ig"``). A
    confirmation line is printed once the file is written.

    Args:
        tH_igs: Mapping ``tH -> list`` of distinct Ig values.
        tH_no_ig: Mapping ``tH -> int`` count of records without an Ig field.
        out_path: Destination file path; overwritten if it exists.
    """
    def clip(value):
        # Only over-long strings are shortened; everything else passes through.
        if isinstance(value, str) and len(value) > 200:
            return value[:200]+"..."
        return value

    payload = OrderedDict()
    for index in sorted(set(tH_igs.keys()) | set(tH_no_ig.keys())):
        values = tH_igs.get(index, [])
        payload[str(index)] = {
            "ig_values": [clip(item) for item in values],
            "no_ig_count": tH_no_ig.get(index, 0),
            "status": "has_value" if values else "no_ig",
        }

    with open(out_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    print(f"📄 JSON 已写入: {out_path}")
# ── 入口 ─────────────────────────────────────────────────────
def analysis_output_path(log_path):
    """Return the JSON output path that belongs to ``log_path``.

    A trailing ``.log`` is replaced by ``_analysis.json``; any other path
    gets ``_analysis.json`` appended. Only the suffix is touched, so a
    ``.log`` elsewhere in the path is left alone and the result can never
    equal ``log_path`` (which would make the export overwrite the input).
    """
    suffix = ".log"
    stem = log_path[:-len(suffix)] if log_path.endswith(suffix) else log_path
    return stem + "_analysis.json"


# Script entry point: locate the log, parse it, print the report and export
# the summary as JSON next to the log file.
if __name__ == "__main__":
    # An optional first CLI argument overrides the automatic log lookup.
    log_path = find_log(sys.argv[1] if len(sys.argv) > 1 else None)
    print(f"📂 日志文件: {log_path}\n")
    entries = parse(log_path)
    print(f"共解析 {len(entries)} 条索引点记录\n")
    tH_igs, tH_no_ig, tH_lines = summarize(entries)
    report(tH_igs, tH_no_ig, tH_lines)
    out = analysis_output_path(log_path)
    export_json(tH_igs, tH_no_ig, out)