encodedword.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. package parsemail
  2. // copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.4:src/mime/encodedword.go
  3. // Golang官方库的解码函数不支持中文编码,此处实现支持了中文gbk和gb18030编码
  4. import (
  5. "bytes"
  6. "encoding/base64"
  7. "errors"
  8. "fmt"
  9. "golang.org/x/text/encoding/simplifiedchinese"
  10. "io"
  11. "strings"
  12. "unicode"
  13. "unicode/utf8"
  14. )
// A WordEncoder is an RFC 2047 encoded-word encoder. Its underlying byte is
// the encoding letter that openWord writes into every encoded-word header.
type WordEncoder byte
// Encoding schemes available to a WordEncoder.
const (
	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
	BEncoding = WordEncoder('b')
	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
	QEncoding = WordEncoder('q')
)
var (
	// errInvalidWord is returned when the input is not a well-formed RFC 2047
	// encoded-word, names an unknown encoding letter, or carries Q-encoded
	// text that cannot be decoded.
	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
)
  26. // Encode returns the encoded-word form of s. If s is ASCII without special
  27. // characters, it is returned unchanged. The provided charset is the IANA
  28. // charset name of s. It is case insensitive.
  29. func (e WordEncoder) Encode(charset, s string) string {
  30. if !needsEncoding(s) {
  31. return s
  32. }
  33. return e.encodeWord(charset, s)
  34. }
  35. func needsEncoding(s string) bool {
  36. for _, b := range s {
  37. if (b < ' ' || b > '~') && b != '\t' {
  38. return true
  39. }
  40. }
  41. return false
  42. }
  43. // encodeWord encodes a string into an encoded-word.
  44. func (e WordEncoder) encodeWord(charset, s string) string {
  45. var buf strings.Builder
  46. // Could use a hint like len(s)*3, but that's not enough for cases
  47. // with word splits and too much for simpler inputs.
  48. // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
  49. buf.Grow(48)
  50. e.openWord(&buf, charset)
  51. if e == BEncoding {
  52. e.bEncode(&buf, charset, s)
  53. } else {
  54. e.qEncode(&buf, charset, s)
  55. }
  56. closeWord(&buf)
  57. return buf.String()
  58. }
const (
	// The maximum length of an encoded-word is 75 characters.
	// See RFC 2047, section 2.
	maxEncodedWordLen = 75
	// maxContentLen is how much content can be encoded, ignoring the header and
	// 2-byte footer. The header length is measured for the "UTF-8" charset,
	// which is the only charset bEncode and qEncode split encoded-words for.
	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
)
// maxBase64Len is the number of raw bytes bEncode places in a single
// encoded-word: the decoded size that corresponds to maxContentLen base64
// characters.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
  68. // bEncode encodes s using base64 encoding and writes it to buf.
  69. func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
  70. w := base64.NewEncoder(base64.StdEncoding, buf)
  71. // If the charset is not UTF-8 or if the content is short, do not bother
  72. // splitting the encoded-word.
  73. if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
  74. io.WriteString(w, s)
  75. w.Close()
  76. return
  77. }
  78. var currentLen, last, runeLen int
  79. for i := 0; i < len(s); i += runeLen {
  80. // Multi-byte characters must not be split across encoded-words.
  81. // See RFC 2047, section 5.3.
  82. _, runeLen = utf8.DecodeRuneInString(s[i:])
  83. if currentLen+runeLen <= maxBase64Len {
  84. currentLen += runeLen
  85. } else {
  86. io.WriteString(w, s[last:i])
  87. w.Close()
  88. e.splitWord(buf, charset)
  89. last = i
  90. currentLen = runeLen
  91. }
  92. }
  93. io.WriteString(w, s[last:])
  94. w.Close()
  95. }
  96. // qEncode encodes s using Q encoding and writes it to buf. It splits the
  97. // encoded-words when necessary.
  98. func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
  99. // We only split encoded-words when the charset is UTF-8.
  100. if !isUTF8(charset) {
  101. writeQString(buf, s)
  102. return
  103. }
  104. var currentLen, runeLen int
  105. for i := 0; i < len(s); i += runeLen {
  106. b := s[i]
  107. // Multi-byte characters must not be split across encoded-words.
  108. // See RFC 2047, section 5.3.
  109. var encLen int
  110. if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
  111. runeLen, encLen = 1, 1
  112. } else {
  113. _, runeLen = utf8.DecodeRuneInString(s[i:])
  114. encLen = 3 * runeLen
  115. }
  116. if currentLen+encLen > maxContentLen {
  117. e.splitWord(buf, charset)
  118. currentLen = 0
  119. }
  120. writeQString(buf, s[i:i+runeLen])
  121. currentLen += encLen
  122. }
  123. }
  124. // writeQString encodes s using Q encoding and writes it to buf.
  125. func writeQString(buf *strings.Builder, s string) {
  126. for i := 0; i < len(s); i++ {
  127. switch b := s[i]; {
  128. case b == ' ':
  129. buf.WriteByte('_')
  130. case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
  131. buf.WriteByte(b)
  132. default:
  133. buf.WriteByte('=')
  134. buf.WriteByte(upperhex[b>>4])
  135. buf.WriteByte(upperhex[b&0x0f])
  136. }
  137. }
  138. }
  139. // openWord writes the beginning of an encoded-word into buf.
  140. func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
  141. buf.WriteString("=?")
  142. buf.WriteString(charset)
  143. buf.WriteByte('?')
  144. buf.WriteByte(byte(e))
  145. buf.WriteByte('?')
  146. }
  147. // closeWord writes the end of an encoded-word into buf.
  148. func closeWord(buf *strings.Builder) {
  149. buf.WriteString("?=")
  150. }
  151. // splitWord closes the current encoded-word and opens a new one.
  152. func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
  153. closeWord(buf)
  154. buf.WriteByte(' ')
  155. e.openWord(buf, charset)
  156. }
  157. func isUTF8(charset string) bool {
  158. return strings.EqualFold(charset, "UTF-8")
  159. }
// upperhex maps a 4-bit value to its upper-case hexadecimal digit; it is
// indexed by writeQString when escaping a byte.
const upperhex = "0123456789ABCDEF"
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// Charsets are always lower-case. The utf-8, iso-8859-1, us-ascii,
	// gbk and gb18030 charsets are handled by default and are never passed
	// to CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
  171. // Decode decodes an RFC 2047 encoded-word.
  172. func (d *WordDecoder) Decode(word string) (string, error) {
  173. // See https://tools.ietf.org/html/rfc2047#section-2 for details.
  174. // Our decoder is permissive, we accept empty encoded-text.
  175. if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
  176. return "", errInvalidWord
  177. }
  178. word = word[2 : len(word)-2]
  179. // split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
  180. charset, text, _ := strings.Cut(word, "?")
  181. if charset == "" {
  182. return "", errInvalidWord
  183. }
  184. encoding, text, _ := strings.Cut(text, "?")
  185. if len(encoding) != 1 {
  186. return "", errInvalidWord
  187. }
  188. content, err := decode(encoding[0], text)
  189. if err != nil {
  190. return "", err
  191. }
  192. var buf strings.Builder
  193. if err := d.convert(&buf, charset, content); err != nil {
  194. return "", err
  195. }
  196. return buf.String(), nil
  197. }
// DecodeHeader decodes all encoded-words of the given string. Text that is
// not part of a decodable encoded-word is copied to the result unchanged,
// except that whitespace separating two adjacent encoded-words is dropped.
//
// It returns an error only when converting the decoded bytes of an
// encoded-word to UTF-8 fails: the charset is not handled and d.CharsetReader
// is nil, or the charset decoder or CharsetReader reports an error.
func (d *WordDecoder) DecodeHeader(header string) (string, error) {
	// If there is no encoded-word, returns before creating a buffer.
	i := strings.Index(header, "=?")
	if i == -1 {
		return header, nil
	}
	var buf strings.Builder
	// Everything before the first "=?" is plain text.
	buf.WriteString(header[:i])
	header = header[i:]
	// betweenWords is true when the previous item written was a decoded
	// encoded-word, so that pure whitespace before the next one is dropped.
	betweenWords := false
	for {
		start := strings.Index(header, "=?")
		if start == -1 {
			break
		}
		cur := start + len("=?")
		// The charset runs up to the next '?'.
		i := strings.Index(header[cur:], "?")
		if i == -1 {
			break
		}
		charset := header[cur : cur+i]
		cur += i + len("?")
		// At least the encoding letter, a '?', and the closing "?=" must
		// remain, otherwise this cannot be an encoded-word.
		if len(header) < cur+len("Q??=") {
			break
		}
		encoding := header[cur]
		cur++
		if header[cur] != '?' {
			break
		}
		cur++
		// The encoded text runs up to the closing "?=".
		j := strings.Index(header[cur:], "?=")
		if j == -1 {
			break
		}
		text := header[cur : cur+j]
		end := cur + j + len("?=")
		content, err := decode(encoding, text)
		if err != nil {
			// Not a valid encoded-word: emit everything through this "=?"
			// literally and resume scanning right after it, so that a later
			// encoded-word can still be found.
			betweenWords = false
			buf.WriteString(header[:start+2])
			header = header[start+2:]
			continue
		}
		// Write characters before the encoded-word. White-space and newline
		// characters separating two encoded-words must be deleted.
		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
			buf.WriteString(header[:start])
		}
		if err := d.convert(&buf, charset, content); err != nil {
			return "", err
		}
		header = header[end:]
		betweenWords = true
	}
	// Whatever is left contains no further complete encoded-word.
	if len(header) > 0 {
		buf.WriteString(header)
	}
	return buf.String(), nil
}
  260. func decode(encoding byte, text string) ([]byte, error) {
  261. switch encoding {
  262. case 'B', 'b':
  263. return base64.StdEncoding.DecodeString(text)
  264. case 'Q', 'q':
  265. return qDecode(text)
  266. default:
  267. return nil, errInvalidWord
  268. }
  269. }
  270. func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
  271. switch {
  272. case strings.EqualFold("utf-8", charset):
  273. buf.Write(content)
  274. case strings.EqualFold("iso-8859-1", charset):
  275. for _, c := range content {
  276. buf.WriteRune(rune(c))
  277. }
  278. case strings.EqualFold("us-ascii", charset):
  279. for _, c := range content {
  280. if c >= utf8.RuneSelf {
  281. buf.WriteRune(unicode.ReplacementChar)
  282. } else {
  283. buf.WriteByte(c)
  284. }
  285. }
  286. case strings.EqualFold("gb18030", charset):
  287. decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes(content)
  288. if err != nil {
  289. return err
  290. }
  291. buf.Write(decodeBytes)
  292. case strings.EqualFold("gbk", charset):
  293. decodeBytes, err := simplifiedchinese.GBK.NewDecoder().Bytes(content)
  294. if err != nil {
  295. return err
  296. }
  297. buf.Write(decodeBytes)
  298. default:
  299. if d.CharsetReader == nil {
  300. return fmt.Errorf("mime: unhandled charset %q", charset)
  301. }
  302. r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
  303. if err != nil {
  304. return err
  305. }
  306. if _, err = io.Copy(buf, r); err != nil {
  307. return err
  308. }
  309. }
  310. return nil
  311. }
  312. // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
  313. // one byte of non-whitespace.
  314. func hasNonWhitespace(s string) bool {
  315. for _, b := range s {
  316. switch b {
  317. // Encoded-words can only be separated by linear white spaces which does
  318. // not include vertical tabs (\v).
  319. case ' ', '\t', '\n', '\r':
  320. default:
  321. return true
  322. }
  323. }
  324. return false
  325. }
  326. // qDecode decodes a Q encoded string.
  327. func qDecode(s string) ([]byte, error) {
  328. dec := make([]byte, len(s))
  329. n := 0
  330. for i := 0; i < len(s); i++ {
  331. switch c := s[i]; {
  332. case c == '_':
  333. dec[n] = ' '
  334. case c == '=':
  335. if i+2 >= len(s) {
  336. return nil, errInvalidWord
  337. }
  338. b, err := readHexByte(s[i+1], s[i+2])
  339. if err != nil {
  340. return nil, err
  341. }
  342. dec[n] = b
  343. i += 2
  344. case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
  345. dec[n] = c
  346. default:
  347. return nil, errInvalidWord
  348. }
  349. n++
  350. }
  351. return dec[:n], nil
  352. }
  353. // readHexByte returns the byte from its quoted-printable representation.
  354. func readHexByte(a, b byte) (byte, error) {
  355. var hb, lb byte
  356. var err error
  357. if hb, err = fromHex(a); err != nil {
  358. return 0, err
  359. }
  360. if lb, err = fromHex(b); err != nil {
  361. return 0, err
  362. }
  363. return hb<<4 | lb, nil
  364. }
  365. func fromHex(b byte) (byte, error) {
  366. switch {
  367. case b >= '0' && b <= '9':
  368. return b - '0', nil
  369. case b >= 'A' && b <= 'F':
  370. return b - 'A' + 10, nil
  371. // Accept badly encoded bytes.
  372. case b >= 'a' && b <= 'f':
  373. return b - 'a' + 10, nil
  374. }
  375. return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
  376. }