| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426 |
- package parsemail
- // copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.4:src/mime/encodedword.go
- // The decoder in Go's standard library does not support Chinese charsets; this copy adds support for the GBK and GB18030 encodings.
- import (
- "bytes"
- "encoding/base64"
- "errors"
- "fmt"
- "golang.org/x/text/encoding/simplifiedchinese"
- "io"
- "strings"
- "unicode"
- "unicode/utf8"
- )
// WordEncoder encodes text as RFC 2047 encoded-words. Its byte value is the
// letter written into the encoding field of every encoded-word it produces.
type WordEncoder byte

// Encoding schemes available to a WordEncoder.
const (
	// BEncoding selects the Base64 scheme described in RFC 2045.
	BEncoding WordEncoder = 'b'
	// QEncoding selects the Q-encoding scheme described in RFC 2047.
	QEncoding WordEncoder = 'q'
)
// errInvalidWord reports that the input is not a well-formed RFC 2047
// encoded-word.
var errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
- // Encode returns the encoded-word form of s. If s is ASCII without special
- // characters, it is returned unchanged. The provided charset is the IANA
- // charset name of s. It is case insensitive.
- func (e WordEncoder) Encode(charset, s string) string {
- if !needsEncoding(s) {
- return s
- }
- return e.encodeWord(charset, s)
- }
// needsEncoding reports whether s contains any rune other than a tab or a
// printable ASCII character (space through '~').
func needsEncoding(s string) bool {
	for _, r := range s {
		switch {
		case r == '\t':
			// Tabs are allowed to pass through unencoded.
		case r < ' ', r > '~':
			return true
		}
	}
	return false
}
- // encodeWord encodes a string into an encoded-word.
- func (e WordEncoder) encodeWord(charset, s string) string {
- var buf strings.Builder
- // Could use a hint like len(s)*3, but that's not enough for cases
- // with word splits and too much for simpler inputs.
- // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
- buf.Grow(48)
- e.openWord(&buf, charset)
- if e == BEncoding {
- e.bEncode(&buf, charset, s)
- } else {
- e.qEncode(&buf, charset, s)
- }
- closeWord(&buf)
- return buf.String()
- }
// maxEncodedWordLen is the longest an encoded-word may be, in characters.
// See RFC 2047, section 2.
const maxEncodedWordLen = 75

// maxContentLen is the room left for encoded content once the header
// (measured for the UTF-8 charset) and the 2-byte footer are subtracted.
const maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")

// maxBase64Len is the number of decoded bytes that corresponds to
// maxContentLen characters of base64 text.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
- // bEncode encodes s using base64 encoding and writes it to buf.
- func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
- w := base64.NewEncoder(base64.StdEncoding, buf)
- // If the charset is not UTF-8 or if the content is short, do not bother
- // splitting the encoded-word.
- if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
- io.WriteString(w, s)
- w.Close()
- return
- }
- var currentLen, last, runeLen int
- for i := 0; i < len(s); i += runeLen {
- // Multi-byte characters must not be split across encoded-words.
- // See RFC 2047, section 5.3.
- _, runeLen = utf8.DecodeRuneInString(s[i:])
- if currentLen+runeLen <= maxBase64Len {
- currentLen += runeLen
- } else {
- io.WriteString(w, s[last:i])
- w.Close()
- e.splitWord(buf, charset)
- last = i
- currentLen = runeLen
- }
- }
- io.WriteString(w, s[last:])
- w.Close()
- }
- // qEncode encodes s using Q encoding and writes it to buf. It splits the
- // encoded-words when necessary.
- func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
- // We only split encoded-words when the charset is UTF-8.
- if !isUTF8(charset) {
- writeQString(buf, s)
- return
- }
- var currentLen, runeLen int
- for i := 0; i < len(s); i += runeLen {
- b := s[i]
- // Multi-byte characters must not be split across encoded-words.
- // See RFC 2047, section 5.3.
- var encLen int
- if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
- runeLen, encLen = 1, 1
- } else {
- _, runeLen = utf8.DecodeRuneInString(s[i:])
- encLen = 3 * runeLen
- }
- if currentLen+encLen > maxContentLen {
- e.splitWord(buf, charset)
- currentLen = 0
- }
- writeQString(buf, s[i:i+runeLen])
- currentLen += encLen
- }
- }
- // writeQString encodes s using Q encoding and writes it to buf.
- func writeQString(buf *strings.Builder, s string) {
- for i := 0; i < len(s); i++ {
- switch b := s[i]; {
- case b == ' ':
- buf.WriteByte('_')
- case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
- buf.WriteByte(b)
- default:
- buf.WriteByte('=')
- buf.WriteByte(upperhex[b>>4])
- buf.WriteByte(upperhex[b&0x0f])
- }
- }
- }
- // openWord writes the beginning of an encoded-word into buf.
- func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
- buf.WriteString("=?")
- buf.WriteString(charset)
- buf.WriteByte('?')
- buf.WriteByte(byte(e))
- buf.WriteByte('?')
- }
// closeWord appends the encoded-word terminator to buf.
func closeWord(buf *strings.Builder) {
	const terminator = "?="
	buf.WriteString(terminator)
}
- // splitWord closes the current encoded-word and opens a new one.
- func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
- closeWord(buf)
- buf.WriteByte(' ')
- e.openWord(buf, charset)
- }
// isUTF8 reports whether charset names UTF-8, ignoring letter case.
func isUTF8(charset string) bool {
	const utf8Name = "UTF-8"
	return strings.EqualFold(utf8Name, charset)
}
// upperhex lists the uppercase hexadecimal digits, indexed by nibble value.
const upperhex = "0123456789ABCDEF"
- // A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
- type WordDecoder struct {
- // CharsetReader, if non-nil, defines a function to generate
- // charset-conversion readers, converting from the provided
- // charset into UTF-8. It is only consulted for charsets not handled by default.
- // Charsets are always lower-case. utf-8, iso-8859-1, us-ascii, gb18030 and gbk
- // charsets are handled by default.
- // One of the CharsetReader's result values must be non-nil.
- CharsetReader func(charset string, input io.Reader) (io.Reader, error)
- }
- // Decode decodes an RFC 2047 encoded-word.
- func (d *WordDecoder) Decode(word string) (string, error) {
- // See https://tools.ietf.org/html/rfc2047#section-2 for details.
- // Our decoder is permissive, we accept empty encoded-text.
- if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
- return "", errInvalidWord
- }
- word = word[2 : len(word)-2]
- // split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
- charset, text, _ := strings.Cut(word, "?")
- if charset == "" {
- return "", errInvalidWord
- }
- encoding, text, _ := strings.Cut(text, "?")
- if len(encoding) != 1 {
- return "", errInvalidWord
- }
- content, err := decode(encoding[0], text)
- if err != nil {
- return "", err
- }
- var buf strings.Builder
- if err := d.convert(&buf, charset, content); err != nil {
- return "", err
- }
- return buf.String(), nil
- }
- // DecodeHeader decodes all encoded-words of the given string. Candidates that are malformed or cannot be decoded are left in the output verbatim.
- // It returns an error only when converting a decoded word to UTF-8 fails, e.g. an unhandled charset with a nil CharsetReader, or an error returned by CharsetReader.
- func (d *WordDecoder) DecodeHeader(header string) (string, error) {
- // If there is no encoded-word, return the input as is before creating a buffer.
- i := strings.Index(header, "=?")
- if i == -1 {
- return header, nil
- }
- var buf strings.Builder
- buf.WriteString(header[:i]) // text before the first "=?" is copied verbatim
- header = header[i:]
- betweenWords := false // true when the previously consumed token was a decoded encoded-word
- for {
- start := strings.Index(header, "=?")
- if start == -1 {
- break // no more candidates; the remaining tail is written after the loop
- }
- cur := start + len("=?") // cur walks through the candidate, starting at the charset
- i := strings.Index(header[cur:], "?") // shadows the outer i: length of the charset field
- if i == -1 {
- break
- }
- charset := header[cur : cur+i]
- cur += i + len("?")
- if len(header) < cur+len("Q??=") { // need room for an encoding byte, a '?' and the closing "?="
- break
- }
- encoding := header[cur]
- cur++
- if header[cur] != '?' {
- break
- }
- cur++
- j := strings.Index(header[cur:], "?=") // length of the encoded-text
- if j == -1 {
- break
- }
- text := header[cur : cur+j]
- end := cur + j + len("?=") // index just past the closing "?="
- content, err := decode(encoding, text)
- if err != nil { // not decodable: emit everything through this "=?" literally and rescan after it
- betweenWords = false
- buf.WriteString(header[:start+2])
- header = header[start+2:]
- continue
- }
- // Write characters before the encoded-word. White-space and newline
- // characters separating two encoded-words must be deleted.
- if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
- buf.WriteString(header[:start])
- }
- if err := d.convert(&buf, charset, content); err != nil {
- return "", err
- }
- header = header[end:] // drop everything up to and including this encoded-word
- betweenWords = true
- }
- if len(header) > 0 {
- buf.WriteString(header)
- }
- return buf.String(), nil
- }
- func decode(encoding byte, text string) ([]byte, error) {
- switch encoding {
- case 'B', 'b':
- return base64.StdEncoding.DecodeString(text)
- case 'Q', 'q':
- return qDecode(text)
- default:
- return nil, errInvalidWord
- }
- }
- func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
- switch {
- case strings.EqualFold("utf-8", charset):
- buf.Write(content)
- case strings.EqualFold("iso-8859-1", charset):
- for _, c := range content {
- buf.WriteRune(rune(c))
- }
- case strings.EqualFold("us-ascii", charset):
- for _, c := range content {
- if c >= utf8.RuneSelf {
- buf.WriteRune(unicode.ReplacementChar)
- } else {
- buf.WriteByte(c)
- }
- }
- case strings.EqualFold("gb18030", charset):
- decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes(content)
- if err != nil {
- return err
- }
- buf.Write(decodeBytes)
- case strings.EqualFold("gbk", charset):
- decodeBytes, err := simplifiedchinese.GBK.NewDecoder().Bytes(content)
- if err != nil {
- return err
- }
- buf.Write(decodeBytes)
- default:
- if d.CharsetReader == nil {
- return fmt.Errorf("mime: unhandled charset %q", charset)
- }
- r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
- if err != nil {
- return err
- }
- if _, err = io.Copy(buf, r); err != nil {
- return err
- }
- }
- return nil
- }
// hasNonWhitespace reports whether s (assumed to be ASCII) contains anything
// besides linear white space. Only space, tab, line feed and carriage return
// count as white space; vertical tab (\v) does not, because encoded-words can
// only be separated by linear white space.
func hasNonWhitespace(s string) bool {
	// If anything survives stripping leading white space, s holds at least
	// one non-white-space character.
	return len(strings.TrimLeft(s, " \t\n\r")) > 0
}
- // qDecode decodes a Q encoded string.
- func qDecode(s string) ([]byte, error) {
- dec := make([]byte, len(s))
- n := 0
- for i := 0; i < len(s); i++ {
- switch c := s[i]; {
- case c == '_':
- dec[n] = ' '
- case c == '=':
- if i+2 >= len(s) {
- return nil, errInvalidWord
- }
- b, err := readHexByte(s[i+1], s[i+2])
- if err != nil {
- return nil, err
- }
- dec[n] = b
- i += 2
- case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
- dec[n] = c
- default:
- return nil, errInvalidWord
- }
- n++
- }
- return dec[:n], nil
- }
- // readHexByte returns the byte from its quoted-printable representation.
- func readHexByte(a, b byte) (byte, error) {
- var hb, lb byte
- var err error
- if hb, err = fromHex(a); err != nil {
- return 0, err
- }
- if lb, err = fromHex(b); err != nil {
- return 0, err
- }
- return hb<<4 | lb, nil
- }
// fromHex returns the numeric value (0-15) of the hexadecimal digit b, or an
// error when b is not a hexadecimal digit. Lowercase digits are accepted.
func fromHex(b byte) (byte, error) {
	if '0' <= b && b <= '9' {
		return b - '0', nil
	}
	if 'A' <= b && b <= 'F' {
		return b - 'A' + 10, nil
	}
	// Accept badly encoded (lowercase) bytes.
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10, nil
	}
	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
}
|