|
1 | 1 | package top.howiehz.halo.plugin.extra.api.finder; |
2 | 2 |
|
3 | 3 | import java.util.*; |
4 | | -import java.util.regex.Pattern; |
5 | | -import org.springframework.data.domain.Sort; |
| 4 | +import java.math.BigInteger; |
6 | 5 | import org.springframework.stereotype.Component; |
7 | 6 | import reactor.core.publisher.Mono; |
8 | | -import run.halo.app.content.ContentWrapper; |
9 | | -import run.halo.app.content.PostContentService; |
10 | | -import run.halo.app.core.extension.content.Post; |
11 | | -import run.halo.app.extension.ListOptions; |
12 | | -import run.halo.app.extension.ReactiveExtensionClient; |
13 | 7 | import run.halo.app.theme.finders.Finder; |
| 8 | +import top.howiehz.halo.plugin.extra.api.service.PostWordCountService; |
14 | 9 |
|
15 | 10 | /** |
16 | 11 | * Implementation of ExtraApiStatsFinder. |
| 12 | + * 统计 Finder 的实现,用于为主题提供字数统计能力。 |
17 | 13 | */ |
18 | 14 | @Component |
19 | 15 | @Finder("extraApiStatsFinder") |
20 | 16 | public class ExtraApiStatsFinderImpl implements ExtraApiStatsFinder { |
21 | 17 |
|
22 | | - private final ReactiveExtensionClient client; // 响应式扩展客户端 / Reactive extension client |
23 | | - private final PostContentService postContentService; // 文章内容服务 / Post content service |
| 18 | + private final PostWordCountService postWordCountService; |
24 | 19 |
|
25 | 20 | /** |
26 | 21 | * Constructor to initialize ExtraApiStatsFinderImpl with required dependencies. |
27 | | - * 构造函数,使用必需的依赖项初始化 ExtraApiStatsFinderImpl。 |
| 22 | + * 构造函数,注入所需依赖。 |
28 | 23 | */ |
29 | | - public ExtraApiStatsFinderImpl(ReactiveExtensionClient client, |
30 | | - PostContentService postContentService) { |
31 | | - this.client = client; // 注入响应式扩展客户端 / Inject reactive extension client |
32 | | - this.postContentService = postContentService; // 注入文章内容服务 / Inject post content service |
33 | | - } |
34 | | - |
35 | | - /** |
36 | | - * Get the word count of content by post name and method name. |
37 | | - * 根据文章名称和方法名获取内容的字数统计。 |
38 | | - * |
39 | | - * @param name the post name / 文章名称 |
40 | | - * @param methodName the method name to invoke / 要调用的方法名 |
41 | | - * @return word count as Mono / 返回字数统计的 Mono |
42 | | - */ |
43 | | - private Mono<Integer> postContentCountByName(String name, String methodName) { |
44 | | - if (name == null || name.isBlank()) { |
45 | | - return Mono.just(0); // 空名称直接返回0 / Return 0 for empty name |
46 | | - } |
47 | | - |
48 | | - // 使用函数接口映射方法名到对应的服务调用 / Use function interface to map method name to service call |
49 | | - Mono<ContentWrapper> contentMono = switch (methodName) { |
50 | | - case "getHeadContent" -> postContentService.getHeadContent(name); |
51 | | - case "getReleaseContent" -> postContentService.getReleaseContent(name); |
52 | | - default -> Mono.empty(); // 不支持的方法名返回空 / Return empty for unsupported method |
53 | | - }; |
54 | | - |
55 | | - return contentMono.map(ContentWrapper::getContent) // 提取 content 字段 / Extract content field |
56 | | - .map(content -> countWords( |
57 | | - extractText(content))) // 从 HTML 提取文本并计数 / Extract text and count |
58 | | - .onErrorReturn(0) // 出错时返回 0 / Return 0 on error |
59 | | - .defaultIfEmpty(0); // 空结果时返回 0 / Return 0 if empty |
60 | | - } |
61 | | - |
62 | | - // Patterns for stripping HTML quickly |
63 | | - // 快速移除HTML的正则表达式模式 / Patterns for quickly stripping HTML |
64 | | - private static final Pattern HTML_CONTENT_REMOVAL = |
65 | | - Pattern.compile("(?is)<(?:script|style)\\b[^>]*>.*?</(?:script|style)>|<[^>]+>|&[a-zA-Z0-9#]+;"); |
66 | | - |
67 | | - /** |
68 | | - * Extract plain text from HTML content by removing tags and entities. |
69 | | - * Removes script and style tags, HTML tags, and normalizes whitespace. |
70 | | - * 从 HTML 内容中提取纯文本,移除标签和实体。 |
71 | | - * 移除 script 和 style 标签、HTML 标签,并规范化空白字符。 |
72 | | - * |
73 | | - * @param html the HTML content / HTML 内容 |
74 | | - * @return plain text / 返回纯文本 |
75 | | - */ |
76 | | - static String extractText(String html) { |
77 | | - if (html == null || html.isBlank()) { |
78 | | - return ""; |
79 | | - } |
80 | | - |
81 | | - // 一次性处理所有HTML标签和实体 |
82 | | - return HTML_CONTENT_REMOVAL.matcher(html).replaceAll(" "); |
83 | | - } |
84 | | - |
85 | | - /** |
86 | | - * Count words in text, supporting both CJK characters and ASCII words. |
87 | | - * CJK characters are counted individually, ASCII letters/digits are grouped as words. |
88 | | - * 统计文本中的词数,支持中日韩字符和 ASCII 单词。 |
89 | | - * 中日韩字符单独计数,ASCII 字母/数字按单词分组计数。 |
90 | | - * |
91 | | - * @param text the input text / 输入文本 |
92 | | - * @return word count / 返回词数统计 |
93 | | - */ |
94 | | - static int countWords(String text) { |
95 | | - if (text == null || text.isEmpty()) { |
96 | | - return 0; |
97 | | - } |
98 | | - int count = 0; // 字数计数器 / Word count counter |
99 | | - boolean inAsciiWord = false; // 是否在ASCII单词中 / Whether in an ASCII word |
100 | | - int length = text.length(); |
101 | | - for (int i = 0; i < length; ) { |
102 | | - int codePoint = text.codePointAt(i); // 获取当前字符的码点 / Get code point of current character |
103 | | - if (isCJK(codePoint)) { |
104 | | - // count each CJK code point as one word/char |
105 | | - // 每个CJK码点计为一个字/词 / Count each CJK code point as one word/char |
106 | | - count++; |
107 | | - inAsciiWord = false; // 重置ASCII单词状态 / Reset ASCII word state |
108 | | - } else if (Character.isLetterOrDigit(codePoint)) { |
109 | | - // group consecutive ASCII letters/digits as one word |
110 | | - // 连续的ASCII字母/数字作为一个单词 / Group consecutive ASCII letters/digits as one word |
111 | | - if (!inAsciiWord) { |
112 | | - count++; // 开始新的ASCII单词 / Start a new ASCII word |
113 | | - inAsciiWord = true; // 设置在ASCII单词中 / Set in ASCII word |
114 | | - } |
115 | | - } else { |
116 | | - inAsciiWord = false; // 非字母数字字符,重置状态 / Non-alphanumeric character, reset state |
117 | | - } |
118 | | - // 使用位运算优化字符长度计算 / Optimize character length calculation with bitwise operations |
119 | | - i += (codePoint <= 0xFFFF) ? 1 : 2; |
120 | | - } |
121 | | - return Math.max(count, 0); // 确保返回非负数 / Ensure non-negative result |
122 | | - } |
123 | | - |
124 | | - |
125 | | - /** |
126 | | - * Check if a Unicode code point belongs to CJK (Chinese, Japanese, Korean) character blocks. |
127 | | - * Includes various CJK unified ideographs, compatibility ideographs, and phonetic extensions. |
128 | | - * 检查 Unicode 码点是否属于中日韩 (CJK) 字符块。 |
129 | | - * 包括各种 CJK 统一表意文字、兼容表意文字和音标扩展。 |
130 | | - * Optimized CJK character detection using range checks. |
131 | | - * 使用范围检查优化的CJK字符检测。 |
132 | | - */ |
133 | | - private static boolean isCJK(int codePoint) { |
134 | | - // 常见CJK范围的快速检查 / Fast check for common CJK ranges |
135 | | - return (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || // CJK Unified Ideographs |
136 | | - (codePoint >= 0x3400 && codePoint <= 0x4DBF) || // CJK Extension A |
137 | | - (codePoint >= 0x20000 && codePoint <= 0x2A6DF) || // CJK Extension B |
138 | | - (codePoint >= 0x2A700 && codePoint <= 0x2B73F) || // CJK Extension C |
139 | | - (codePoint >= 0x2B740 && codePoint <= 0x2B81F) || // CJK Extension D |
140 | | - (codePoint >= 0x2B820 && codePoint <= 0x2CEAF) || // CJK Extension E |
141 | | - (codePoint >= 0x2CEB0 && codePoint <= 0x2EBEF) || // CJK Extension F |
142 | | - (codePoint >= 0xF900 && codePoint <= 0xFAFF) || // CJK Compatibility Ideographs |
143 | | - (codePoint >= 0x2F800 && codePoint <= 0x2FA1F) || |
144 | | - // CJK Compatibility Ideographs Supplement |
145 | | - (codePoint >= 0x3040 && codePoint <= 0x309F) || // Hiragana |
146 | | - (codePoint >= 0x30A0 && codePoint <= 0x30FF) || // Katakana |
147 | | - (codePoint >= 0x31F0 && codePoint <= 0x31FF) || // Katakana Phonetic Extensions |
148 | | - (codePoint >= 0xAC00 && codePoint <= 0xD7AF) || // Hangul Syllables |
149 | | - (codePoint >= 0x1100 && codePoint <= 0x11FF) || // Hangul Jamo |
150 | | - (codePoint >= 0x3130 && codePoint <= 0x318F); // Hangul Compatibility Jamo |
| 24 | + public ExtraApiStatsFinderImpl(PostWordCountService postWordCountService) { |
| 25 | + this.postWordCountService = postWordCountService; |
151 | 26 | } |
152 | 27 |
|
153 | 28 | /** |
154 | 29 | * Unified word count API without slug support. |
155 | 30 | * If name provided, count by name; otherwise sum word counts across all posts |
156 | 31 | * (release/draft selectable by version). |
157 | | - * 统一的字数统计API,不支持slug。 |
158 | | - * 如果提供name参数则统计指定文章,否则统计所有文章的字数总和。 |
| 32 | + * 统一的字数统计 API |
| 33 | + * 若提供 name 参数则按名称统计,否则统计所有文章的总字数(version 可选 release/draft)。 |
159 | 34 | * |
160 | | - * @param params parameter map: name? version? ('release'|'draft', default 'release') |
161 | | - * @return word count as Mono (non-negative) |
| 35 | + * @param params parameter map: name? version? ('release'|'draft', default 'release') / |
| 36 | + * 参数映射:name?version?('release' 或 'draft',默认 'release') |
| 37 | + * @return word count as Mono (non-negative) / 返回字数(非负)的 Mono |
162 | 38 | */ |
163 | 39 | @Override |
164 | | - public Mono<Integer> wordCount(Map<String, Object> params) { |
| 40 | + public Mono<BigInteger> postWordCount(Map<String, Object> params) { |
165 | 41 | Map<String, Object> map = params == null ? java.util.Collections.emptyMap() : params; |
166 | | - String name = String.valueOf(map.get("name")); |
| 42 | + String postName = String.valueOf(map.get("name")); |
167 | 43 | boolean isDraft = |
168 | 44 | String.valueOf(map.getOrDefault("version", "release")).equalsIgnoreCase("draft"); |
169 | 45 |
|
170 | | - if ("null".equals(name) || name.isBlank()) { |
171 | | - return sumWordCountsAcrossAllPosts(isDraft); |
| 46 | + if ("null".equals(postName) || postName.isBlank()) { |
| 47 | + return postWordCountService.getTotalPostWordCount(isDraft); |
172 | 48 | } |
173 | 49 |
|
174 | | - return isDraft ? postContentCountByName(name, "getHeadContent") |
175 | | - : postContentCountByName(name, "getReleaseContent"); |
176 | | - } |
177 | | - |
178 | | - /** |
179 | | - * sum word counts across all posts with pagination. |
180 | | - * 统计所有文章的字数总和。 |
181 | | - */ |
182 | | - private Mono<Integer> sumWordCountsAcrossAllPosts(boolean isDraft) { |
183 | | - return client.listAll(Post.class, ListOptions.builder().build(), Sort.unsorted()) |
184 | | - .map(post -> post.getMetadata().getName()) // 提取需要的名称 |
185 | | - .flatMapSequential(postName -> isDraft ? postContentCountByName(postName, "getHeadContent") |
186 | | - : postContentCountByName(postName, "getReleaseContent"), 1024) // 1024 并发 |
187 | | - .reduce(0, Integer::sum) // 直接累加 |
188 | | - .onErrorReturn(0); |
| 50 | + return postWordCountService.getPostWordCount(postName, isDraft); |
189 | 51 | } |
190 | 52 | } |
0 commit comments