html2text.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. package html2text
  2. import (
  3. "bytes"
  4. "io"
  5. "regexp"
  6. "strings"
  7. "unicode"
  8. "github.com/olekukonko/tablewriter"
  9. "github.com/ssor/bom"
  10. "golang.org/x/net/html"
  11. "golang.org/x/net/html/atom"
  12. )
  13. // Options provide toggles and overrides to control specific rendering behaviors.
  14. type Options struct {
  15. PrettyTables bool // Turns on pretty ASCII rendering for table elements.
  16. }
  17. // FromHTMLNode renders text output from a pre-parsed HTML document.
  18. func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
  19. var options Options
  20. if len(o) > 0 {
  21. options = o[0]
  22. }
  23. ctx := textifyTraverseContext{
  24. buf: bytes.Buffer{},
  25. options: options,
  26. }
  27. if err := ctx.traverse(doc); err != nil {
  28. return "", err
  29. }
  30. text := strings.TrimSpace(newlineRe.ReplaceAllString(
  31. strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
  32. )
  33. return text, nil
  34. }
  35. // FromReader renders text output after parsing HTML for the specified
  36. // io.Reader.
  37. func FromReader(reader io.Reader, options ...Options) (string, error) {
  38. newReader, err := bom.NewReaderWithoutBom(reader)
  39. if err != nil {
  40. return "", err
  41. }
  42. doc, err := html.Parse(newReader)
  43. if err != nil {
  44. return "", err
  45. }
  46. return FromHTMLNode(doc, options...)
  47. }
  48. // FromString parses HTML from the input string, then renders the text form.
  49. func FromString(input string, options ...Options) (string, error) {
  50. bs := bom.CleanBom([]byte(input))
  51. text, err := FromReader(bytes.NewReader(bs), options...)
  52. if err != nil {
  53. return "", err
  54. }
  55. return text, nil
  56. }
  57. var (
  58. spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
  59. newlineRe = regexp.MustCompile(`\n\n+`)
  60. )
  61. // traverseTableCtx holds text-related context.
  62. type textifyTraverseContext struct {
  63. buf bytes.Buffer
  64. prefix string
  65. tableCtx tableTraverseContext
  66. options Options
  67. endsWithSpace bool
  68. justClosedDiv bool
  69. blockquoteLevel int
  70. lineLength int
  71. }
  72. // tableTraverseContext holds table ASCII-form related context.
  73. type tableTraverseContext struct {
  74. header []string
  75. body [][]string
  76. footer []string
  77. tmpRow int
  78. isInFooter bool
  79. }
  80. func (tableCtx *tableTraverseContext) init() {
  81. tableCtx.body = [][]string{}
  82. tableCtx.header = []string{}
  83. tableCtx.footer = []string{}
  84. tableCtx.isInFooter = false
  85. tableCtx.tmpRow = 0
  86. }
  87. func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
  88. ctx.justClosedDiv = false
  89. switch node.DataAtom {
  90. case atom.Br:
  91. return ctx.emit("\n")
  92. case atom.H1, atom.H2, atom.H3:
  93. subCtx := textifyTraverseContext{}
  94. if err := subCtx.traverseChildren(node); err != nil {
  95. return err
  96. }
  97. str := subCtx.buf.String()
  98. dividerLen := 0
  99. for _, line := range strings.Split(str, "\n") {
  100. if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
  101. dividerLen = lineLen - 1
  102. }
  103. }
  104. var divider string
  105. if node.DataAtom == atom.H1 {
  106. divider = strings.Repeat("*", dividerLen)
  107. } else {
  108. divider = strings.Repeat("-", dividerLen)
  109. }
  110. if node.DataAtom == atom.H3 {
  111. return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
  112. }
  113. return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
  114. case atom.Blockquote:
  115. ctx.blockquoteLevel++
  116. ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
  117. if err := ctx.emit("\n"); err != nil {
  118. return err
  119. }
  120. if ctx.blockquoteLevel == 1 {
  121. if err := ctx.emit("\n"); err != nil {
  122. return err
  123. }
  124. }
  125. if err := ctx.traverseChildren(node); err != nil {
  126. return err
  127. }
  128. ctx.blockquoteLevel--
  129. ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
  130. if ctx.blockquoteLevel > 0 {
  131. ctx.prefix += " "
  132. }
  133. return ctx.emit("\n\n")
  134. case atom.Div:
  135. if ctx.lineLength > 0 {
  136. if err := ctx.emit("\n"); err != nil {
  137. return err
  138. }
  139. }
  140. if err := ctx.traverseChildren(node); err != nil {
  141. return err
  142. }
  143. var err error
  144. if !ctx.justClosedDiv {
  145. err = ctx.emit("\n")
  146. }
  147. ctx.justClosedDiv = true
  148. return err
  149. case atom.Li:
  150. if err := ctx.emit("* "); err != nil {
  151. return err
  152. }
  153. if err := ctx.traverseChildren(node); err != nil {
  154. return err
  155. }
  156. return ctx.emit("\n")
  157. case atom.B, atom.Strong:
  158. subCtx := textifyTraverseContext{}
  159. subCtx.endsWithSpace = true
  160. if err := subCtx.traverseChildren(node); err != nil {
  161. return err
  162. }
  163. str := subCtx.buf.String()
  164. return ctx.emit("*" + str + "*")
  165. case atom.A:
  166. // If image is the only child, take its alt text as the link text.
  167. if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
  168. if altText := getAttrVal(img, "alt"); altText != "" {
  169. if err := ctx.emit(altText); err != nil {
  170. return err
  171. }
  172. }
  173. } else if err := ctx.traverseChildren(node); err != nil {
  174. return err
  175. }
  176. hrefLink := ""
  177. if attrVal := getAttrVal(node, "href"); attrVal != "" {
  178. attrVal = ctx.normalizeHrefLink(attrVal)
  179. if attrVal != "" {
  180. hrefLink = "( " + attrVal + " )"
  181. }
  182. }
  183. return ctx.emit(hrefLink)
  184. case atom.P, atom.Ul:
  185. return ctx.paragraphHandler(node)
  186. case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td:
  187. if ctx.options.PrettyTables {
  188. return ctx.handleTableElement(node)
  189. } else if node.DataAtom == atom.Table {
  190. return ctx.paragraphHandler(node)
  191. }
  192. return ctx.traverseChildren(node)
  193. case atom.Style, atom.Script, atom.Head:
  194. // Ignore the subtree.
  195. return nil
  196. default:
  197. return ctx.traverseChildren(node)
  198. }
  199. }
  200. // paragraphHandler renders node children surrounded by double newlines.
  201. func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error {
  202. if err := ctx.emit("\n\n"); err != nil {
  203. return err
  204. }
  205. if err := ctx.traverseChildren(node); err != nil {
  206. return err
  207. }
  208. return ctx.emit("\n\n")
  209. }
  210. // handleTableElement is only to be invoked when options.PrettyTables is active.
  211. func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error {
  212. if !ctx.options.PrettyTables {
  213. panic("handleTableElement invoked when PrettyTables not active")
  214. }
  215. switch node.DataAtom {
  216. case atom.Table:
  217. if err := ctx.emit("\n\n"); err != nil {
  218. return err
  219. }
  220. // Re-intialize all table context.
  221. ctx.tableCtx.init()
  222. // Browse children, enriching context with table data.
  223. if err := ctx.traverseChildren(node); err != nil {
  224. return err
  225. }
  226. buf := &bytes.Buffer{}
  227. table := tablewriter.NewWriter(buf)
  228. table.SetHeader(ctx.tableCtx.header)
  229. table.SetFooter(ctx.tableCtx.footer)
  230. table.AppendBulk(ctx.tableCtx.body)
  231. // Render the table using ASCII.
  232. table.Render()
  233. if err := ctx.emit(buf.String()); err != nil {
  234. return err
  235. }
  236. return ctx.emit("\n\n")
  237. case atom.Tfoot:
  238. ctx.tableCtx.isInFooter = true
  239. if err := ctx.traverseChildren(node); err != nil {
  240. return err
  241. }
  242. ctx.tableCtx.isInFooter = false
  243. case atom.Tr:
  244. ctx.tableCtx.body = append(ctx.tableCtx.body, []string{})
  245. if err := ctx.traverseChildren(node); err != nil {
  246. return err
  247. }
  248. ctx.tableCtx.tmpRow++
  249. case atom.Th:
  250. res, err := ctx.renderEachChild(node)
  251. if err != nil {
  252. return err
  253. }
  254. ctx.tableCtx.header = append(ctx.tableCtx.header, res)
  255. case atom.Td:
  256. res, err := ctx.renderEachChild(node)
  257. if err != nil {
  258. return err
  259. }
  260. if ctx.tableCtx.isInFooter {
  261. ctx.tableCtx.footer = append(ctx.tableCtx.footer, res)
  262. } else {
  263. ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res)
  264. }
  265. }
  266. return nil
  267. }
  268. func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
  269. switch node.Type {
  270. default:
  271. return ctx.traverseChildren(node)
  272. case html.TextNode:
  273. data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
  274. return ctx.emit(data)
  275. case html.ElementNode:
  276. return ctx.handleElement(node)
  277. }
  278. }
  279. func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
  280. for c := node.FirstChild; c != nil; c = c.NextSibling {
  281. if err := ctx.traverse(c); err != nil {
  282. return err
  283. }
  284. }
  285. return nil
  286. }
  287. func (ctx *textifyTraverseContext) emit(data string) error {
  288. if data == "" {
  289. return nil
  290. }
  291. var (
  292. lines = ctx.breakLongLines(data)
  293. err error
  294. )
  295. for _, line := range lines {
  296. runes := []rune(line)
  297. startsWithSpace := unicode.IsSpace(runes[0])
  298. if !startsWithSpace && !ctx.endsWithSpace {
  299. if err = ctx.buf.WriteByte(' '); err != nil {
  300. return err
  301. }
  302. ctx.lineLength++
  303. }
  304. ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
  305. for _, c := range line {
  306. if _, err = ctx.buf.WriteString(string(c)); err != nil {
  307. return err
  308. }
  309. ctx.lineLength++
  310. if c == '\n' {
  311. ctx.lineLength = 0
  312. if ctx.prefix != "" {
  313. if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
  314. return err
  315. }
  316. }
  317. }
  318. }
  319. }
  320. return nil
  321. }
  322. const maxLineLen = 74
  323. func (ctx *textifyTraverseContext) breakLongLines(data string) []string {
  324. // Only break lines when in blockquotes.
  325. if ctx.blockquoteLevel == 0 {
  326. return []string{data}
  327. }
  328. var (
  329. ret = []string{}
  330. runes = []rune(data)
  331. l = len(runes)
  332. existing = ctx.lineLength
  333. )
  334. if existing >= maxLineLen {
  335. ret = append(ret, "\n")
  336. existing = 0
  337. }
  338. for l+existing > maxLineLen {
  339. i := maxLineLen - existing
  340. for i >= 0 && !unicode.IsSpace(runes[i]) {
  341. i--
  342. }
  343. if i == -1 {
  344. // No spaces, so go the other way.
  345. i = maxLineLen - existing
  346. for i < l && !unicode.IsSpace(runes[i]) {
  347. i++
  348. }
  349. }
  350. ret = append(ret, string(runes[:i])+"\n")
  351. for i < l && unicode.IsSpace(runes[i]) {
  352. i++
  353. }
  354. runes = runes[i:]
  355. l = len(runes)
  356. existing = 0
  357. }
  358. if len(runes) > 0 {
  359. ret = append(ret, string(runes))
  360. }
  361. return ret
  362. }
  363. func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
  364. link = strings.TrimSpace(link)
  365. link = strings.TrimPrefix(link, "mailto:")
  366. return link
  367. }
  368. // renderEachChild visits each direct child of a node and collects the sequence of
  369. // textuual representaitons separated by a single newline.
  370. func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {
  371. buf := &bytes.Buffer{}
  372. for c := node.FirstChild; c != nil; c = c.NextSibling {
  373. s, err := FromHTMLNode(c, ctx.options)
  374. if err != nil {
  375. return "", err
  376. }
  377. if _, err = buf.WriteString(s); err != nil {
  378. return "", err
  379. }
  380. if c.NextSibling != nil {
  381. if err = buf.WriteByte('\n'); err != nil {
  382. return "", err
  383. }
  384. }
  385. }
  386. return buf.String(), nil
  387. }
  388. func getAttrVal(node *html.Node, attrName string) string {
  389. for _, attr := range node.Attr {
  390. if attr.Key == attrName {
  391. return attr.Val
  392. }
  393. }
  394. return ""
  395. }