Line data Source code
1 : /*
2 : * Famedly Matrix SDK
3 : * Copyright (C) 2021 Famedly GmbH
4 : *
5 : * This program is free software: you can redistribute it and/or modify
6 : * it under the terms of the GNU Affero General Public License as
7 : * published by the Free Software Foundation, either version 3 of the
8 : * License, or (at your option) any later version.
9 : *
10 : * This program is distributed in the hope that it will be useful,
11 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : * GNU Affero General Public License for more details.
14 : *
15 : * You should have received a copy of the GNU Affero General Public License
16 : * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 : */
18 :
19 : import 'package:collection/collection.dart';
20 : import 'package:html/dom.dart';
21 : import 'package:html/parser.dart';
22 : import 'package:html_unescape/html_unescape.dart';
23 :
/// Converts Matrix message HTML into a pseudo-markdown plain text string.
class HtmlToText {
  /// Matches everything from the first `<mx-reply>` up to the last
  /// `</mx-reply>`, across line breaks.
  ///
  /// The match is intentionally greedy: with nested or mismatched reply tags
  /// it also removes whatever text is trapped between them.
  static final _mxReplyPattern = RegExp('<mx-reply>.*</mx-reply>',
      caseSensitive: false, multiLine: false, dotAll: true);

  /// Matches the whitespace at the very end of the input.
  static final _trailingWhitespacePattern = RegExp(r'\s*$', multiLine: false);

  /// Matches a `<code ...>` opening tag at the very start of the input and
  /// captures its attribute text.
  static final _codeOpenTagPattern =
      RegExp(r'^<code([^>]*)>', multiLine: false, caseSensitive: false);

  /// Matches a `</code>` closing tag at the very end of the input.
  static final _codeCloseTagPattern =
      RegExp(r'</code>$', multiLine: false, caseSensitive: false);

  /// Captures the language name out of a `language-xyz` token.
  static final _languageClassPattern =
      RegExp(r'language-(\w+)', multiLine: false, caseSensitive: false);

  /// Matches a string consisting solely of ASCII digits.
  static final _digitsPattern = RegExp(r'^[0-9]+$', multiLine: false);

  /// Convert an HTML string to a pseudo-markdown plain text representation, with
  /// `data-mx-spoiler` spans redacted
  static String convert(String html) {
    // riot-web is notorious for creating bad reply fallback events from
    // invalid messages which, if not handled properly, can lead to
    // impersonation. As such, we strip the entire `<mx-reply>` tags here
    // already, to prevent that from happening.
    // We do *not* do this in an AST and just with a simple regex here, as
    // riot-web tends to create mismatched tags, and this way we actually
    // correctly identify what we want to strip and, well, strip it.
    final renderHtml = html.replaceAll(_mxReplyPattern, '');

    final opts = _ConvertOpts();
    final reply = _walkNode(opts, parseFragment(renderHtml));
    return reply.replaceAll(_trailingWhitespacePattern, '');
  }

  /// Returns the text that goes between the code fences for a `<pre>` [node].
  ///
  /// When the content starts with a `<code>` tag, that tag and a trailing
  /// `</code>` are removed, and a `language-xyz` token found in the tag's
  /// attributes is placed in front of the text so that it ends up directly
  /// behind the opening fence. HTML entities are unescaped and non-empty
  /// content is wrapped in newlines.
  static String _parsePreContent(_ConvertOpts opts, Element node) {
    final inner = node.innerHtml;
    final codeTag = _codeOpenTagPattern.firstMatch(inner);
    if (codeTag == null) {
      return _padWithNewlines(HtmlUnescape().convert(inner));
    }
    // Drop the <code> opening tag and the </code> closing tag.
    final body =
        inner.substring(codeTag.end).replaceAll(_codeCloseTagPattern, '');
    final text = _padWithNewlines(HtmlUnescape().convert(body));
    final language = _languageClassPattern.firstMatch(codeTag.group(1)!);
    return language == null ? text : '${language.group(1)!}$text';
  }

  /// Ensures that non-empty [text] starts and ends with a newline.
  static String _padWithNewlines(String text) {
    if (text.isEmpty) {
      return text;
    }
    final head = text.startsWith('\n') ? '' : '\n';
    final tail = text.endsWith('\n') ? '' : '\n';
    return '$head$text$tail';
  }

  /// Renders the children of a `<blockquote>` with every line prefixed by
  /// `> `, followed by a newline.
  static String _parseBlockquoteContent(_ConvertOpts opts, Element node) {
    final msg = _walkChildNodes(opts, node);
    return '${msg.split('\n').map((s) => '> $s').join('\n')}\n';
  }

  /// Renders a `<span>`; spans carrying `data-mx-spoiler` are redacted.
  ///
  /// A redacted span becomes one `█` per Unicode code point of its rendered
  /// content, prefixed with the spoiler reason in parentheses when the
  /// attribute value is non-empty.
  static String _parseSpanContent(_ConvertOpts opts, Element node) {
    final content = _walkChildNodes(opts, node);
    final reason = node.attributes['data-mx-spoiler'];
    if (reason == null) {
      return content;
    }
    // Count runes rather than UTF-16 code units so that characters outside
    // the BMP (e.g. most emoji) are redacted as a single block.
    final redacted = '█' * content.runes.length;
    return reason.isEmpty ? redacted : '($reason) $redacted';
  }

  /// Renders the `<li>` children of a `<ul>` as bullet points.
  ///
  /// The bullet symbol cycles through [_listBulletPoints] by nesting depth.
  static String _parseUlContent(_ConvertOpts opts, Element node) {
    // Children are walked one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;
    final bulletPoint =
        _listBulletPoints[opts.listDepth % _listBulletPoints.length];
    final indent = ' ' * opts.listDepth;

    return entries
        .map((s) => '$indent$bulletPoint ${s.replaceAll('\n', '\n$indent ')}')
        .join('\n');
  }

  /// Renders the `<li>` children of an `<ol>` as a numbered list.
  ///
  /// Numbering begins at the `start` attribute when it consists solely of
  /// digits and fits into an [int]; otherwise it begins at 1.
  static String _parseOlContent(_ConvertOpts opts, Element node) {
    // Children are walked one level deeper than the list itself.
    opts.listDepth++;
    final entries = _listChildNodes(opts, node, {'li'});
    opts.listDepth--;
    final startStr = node.attributes['start'];
    // The digit check rejects signs, whitespace and hex prefixes; tryParse
    // additionally guards against digit strings too large for an int, which
    // would make int.parse throw.
    final start = (startStr != null && _digitsPattern.hasMatch(startStr))
        ? (int.tryParse(startStr) ?? 1)
        : 1;
    final indent = ' ' * opts.listDepth;

    return entries
        .mapIndexed((index, s) =>
            '$indent${start + index}. ${s.replaceAll('\n', '\n$indent ')}')
        .join('\n');
  }

  /// Bullet symbols for unordered lists, selected by nesting depth.
  static const _listBulletPoints = <String>['●', '○', '■', '‣'];

  /// Renders each child of [node] separately and returns the results in
  /// document order.
  ///
  /// When [types] is given and non-empty, only child elements whose lowercased
  /// tag name is contained in it are rendered; every other child (text,
  /// comments, other elements) is skipped.
  static List<String> _listChildNodes(_ConvertOpts opts, Element node,
      [Iterable<String>? types]) {
    final filter = (types == null || types.isEmpty) ? null : types;
    final replies = <String>[];
    for (final child in node.nodes) {
      if (filter != null &&
          !(child is Element &&
              filter.contains(child.localName!.toLowerCase()))) {
        continue;
      }
      replies.add(_walkNode(opts, child));
    }
    return replies;
  }

  /// Tags whose rendering has to start on a fresh line.
  static const _blockTags = <String>{
    'blockquote',
    'ul',
    'ol',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'pre',
  };

  /// Renders all children of [node] and concatenates the results.
  ///
  /// Consecutive `<p>` elements are separated by a blank line, and a newline
  /// is inserted in front of a block-level element when the text rendered so
  /// far does not already end with one.
  static String _walkChildNodes(_ConvertOpts opts, Node node) {
    var reply = '';
    // Tag of the most recent element child; text nodes do not reset it.
    var lastTag = '';
    for (final child in node.nodes) {
      final thisTag = child is Element ? child.localName!.toLowerCase() : '';
      if (thisTag == 'p' && lastTag == 'p') {
        reply += '\n\n';
      } else if (_blockTags.contains(thisTag) &&
          reply.isNotEmpty &&
          !reply.endsWith('\n')) {
        reply += '\n';
      }
      reply += _walkNode(opts, child);
      if (thisTag.isNotEmpty) {
        lastTag = thisTag;
      }
    }
    return reply;
  }

  /// Renders a single [node] to its pseudo-markdown text.
  ///
  /// Text nodes are returned verbatim unless they are a lone newline, known
  /// elements are mapped to their markdown-like notation, and anything else
  /// is rendered as the concatenation of its children.
  static String _walkNode(_ConvertOpts opts, Node node) {
    if (node is Text) {
      // ignore \n between single nodes
      return node.text == '\n' ? '' : node.text;
    }
    if (node is! Element) {
      return _walkChildNodes(opts, node);
    }
    final tag = node.localName!.toLowerCase();
    switch (tag) {
      case 'em':
      case 'i':
        return '*${_walkChildNodes(opts, node)}*';
      case 'strong':
      case 'b':
        return '**${_walkChildNodes(opts, node)}**';
      case 'u':
      case 'ins':
        return '__${_walkChildNodes(opts, node)}__';
      case 'del':
      case 'strike':
      case 's':
        return '~~${_walkChildNodes(opts, node)}~~';
      case 'code':
        return '`${node.text}`';
      case 'pre':
        return '```${_parsePreContent(opts, node)}```\n';
      case 'a':
        final href = (node.attributes['href'] ?? '').toLowerCase();
        final content = _walkChildNodes(opts, node);
        // Links using matrix.to or the matrix: scheme are rendered as their
        // plain content, without the link marker.
        if (href.startsWith('https://matrix.to/#/') ||
            href.startsWith('matrix:')) {
          return content;
        }
        return '🔗$content';
      case 'img':
        return node.attributes['alt'] ??
            node.attributes['title'] ??
            node.attributes['src'] ??
            '';
      case 'br':
        return '\n';
      case 'blockquote':
        return _parseBlockquoteContent(opts, node);
      case 'ul':
        return _parseUlContent(opts, node);
      case 'ol':
        return _parseOlContent(opts, node);
      case 'mx-reply':
        return '';
      case 'hr':
        return '\n----------\n';
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // The digit in the tag name is the number of leading '#' marks.
        final mark = '#' * int.parse(tag[1]);
        return '$mark ${_walkChildNodes(opts, node)}\n';
      case 'span':
        return _parseSpanContent(opts, node);
      default:
        return _walkChildNodes(opts, node);
    }
  }
}
250 :
/// Mutable state shared by the helpers during one conversion pass.
class _ConvertOpts {
  /// Current list nesting level; starts at zero.
  var listDepth = 0;
}
|