java/com/google/gerrit/mail/HtmlParser.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

// Copyright (C) 2016 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.gerrit.mail;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import com.google.gerrit.reviewdb.client.Comment;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/** Provides functionality for parsing the HTML part of a {@link MailMessage}. */
public class HtmlParser {

  private static final ImmutableSet<String> MAIL_PROVIDER_EXTRAS =
      ImmutableSet.of(
          "gmail_extra", // "On 01/01/2017 User<user@gmail.com> wrote:"
          "gmail_quote" // Used for quoting original content
          );

  private static final ImmutableSet<String> WHITELISTED_HTML_TAGS =
      ImmutableSet.of(
          "div", // Most user-typed comments are contained in a <div> tag
          "a", // We allow links to be contained in a comment
          "font" // Some email clients like nesting input in a new font tag
          );

  private HtmlParser() {}

  /**
   * Parses comments from html email.
   *
   * <p>This parser goes though all html elements in the email and checks for matching patterns. It
   * keeps track of the last file and comments it encountered to know in which context a parsed
   * comment belongs. It uses the href attributes of <a> tags to identify comments sent out by
   * Gerrit as these are generally more reliable then the text captions.
   *
   * @param email the message as received from the email service
   * @param comments a specific set of comments as sent out in the original notification email.
   *     Comments are expected to be in the same order as they were sent out to in the email.
   * @param changeUrl canonical change URL that points to the change on this Gerrit instance.
   *     Example: https://go-review.googlesource.com/#/c/91570
   * @return list of MailComments parsed from the html part of the email
   */
  public static List<MailComment> parse(
      MailMessage email, Collection<Comment> comments, String changeUrl) {
    // TODO(hiesel) Add support for Gmail Mobile
    // TODO(hiesel) Add tests for other popular email clients

    // This parser goes though all html elements in the email and checks for
    // matching patterns. It keeps track of the last file and comments it
    // encountered to know in which context a parsed comment belongs.
    // It uses the href attributes of <a> tags to identify comments sent out by
    // Gerrit as these are generally more reliable then the text captions.
    List<MailComment> parsedComments = new ArrayList<>();
    Document d = Jsoup.parse(email.htmlContent());
    PeekingIterator<Comment> iter = Iterators.peekingIterator(comments.iterator());

    String lastEncounteredFileName = null;
    Comment lastEncounteredComment = null;
    for (Element e : d.body().getAllElements()) {
      String elementName = e.tagName();
      boolean isInBlockQuote =
          e.parents()
              .stream()
              .anyMatch(
                  p ->
                      p.tagName().equals("blockquote")
                          || MAIL_PROVIDER_EXTRAS.contains(p.className()));

      if (elementName.equals("a")) {
        String href = e.attr("href");
        // Check if there is still a next comment that could be contained in
        // this <a> tag
        if (!iter.hasNext()) {
          continue;
        }
        Comment perspectiveComment = iter.peek();
        if (href.equals(ParserUtil.filePath(changeUrl, perspectiveComment))) {
          if (lastEncounteredFileName == null
              || !lastEncounteredFileName.equals(perspectiveComment.key.filename)) {
            // Not a file-level comment, but users could have typed a comment
            // right after this file annotation to create a new file-level
            // comment. If this file has a file-level comment, we have already
            // set lastEncounteredComment to that file-level comment when we
            // encountered the file link and should not reset it now.
            lastEncounteredFileName = perspectiveComment.key.filename;
            lastEncounteredComment = null;
          } else if (perspectiveComment.lineNbr == 0) {
            // This was originally a file-level comment
            lastEncounteredComment = perspectiveComment;
            iter.next();
          }
          continue;
        } else if (ParserUtil.isCommentUrl(href, changeUrl, perspectiveComment)) {
          // This is a regular inline comment
          lastEncounteredComment = perspectiveComment;
          iter.next();
          continue;
        }
      }

      if (isInBlockQuote) {
        // There is no user-input in quoted text
        continue;
      }
      if (!WHITELISTED_HTML_TAGS.contains(elementName)) {
        // We only accept a set of whitelisted tags that can contain user input
        continue;
      }
      if (elementName.equals("a") && e.attr("href").startsWith("mailto:")) {
        // We don't accept mailto: links in general as they often appear in reply-to lines
        // (User<user@gmail.com> wrote: ...)
        continue;
      }

      // This is a comment typed by the user
      // Replace non-breaking spaces and trim string
      String content = e.ownText().replace('\u00a0', ' ').trim();
      boolean isLink = elementName.equals("a");
      if (!Strings.isNullOrEmpty(content)) {
        if (lastEncounteredComment == null && lastEncounteredFileName == null) {
          // Remove quotation line, email signature and
          // "Sent from my xyz device"
          content = ParserUtil.trimQuotation(content);
          // TODO(hiesel) Add more sanitizer
          if (!Strings.isNullOrEmpty(content)) {
            ParserUtil.appendOrAddNewComment(
                new MailComment(
                    content, null, null, MailComment.CommentType.CHANGE_MESSAGE, isLink),
                parsedComments);
          }
        } else if (lastEncounteredComment == null) {
          ParserUtil.appendOrAddNewComment(
              new MailComment(
                  content,
                  lastEncounteredFileName,
                  null,
                  MailComment.CommentType.FILE_COMMENT,
                  isLink),
              parsedComments);
        } else {
          ParserUtil.appendOrAddNewComment(
              new MailComment(
                  content,
                  null,
                  lastEncounteredComment,
                  MailComment.CommentType.INLINE_COMMENT,
                  isLink),
              parsedComments);
        }
      }
    }
    return parsedComments;
  }
}