QQ账号登录	立即注册>>

您现在的位置：论坛 › 盖世程序员(我猜到了开头却没有猜到结局) › 运维优化 > 数据库html转为text文本内容并保留格式

总共48087条微博

动态微博

发表新帖

查看: 2548|回复: 0

数据库html转为text文本内容并保留格式

admin

1244 主题	544 听众	1万金钱

管理员

TA的每日心情

	衰 2021-2-2 11:21

签到天数: 36 天

[LV.5]常住居民I

电梯直达

楼主

发表于 2014-04-21 20:36:15 |只看该作者 |倒序浏览

package javaOrJs;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Map;
/**
* Copyright (c) 2010-2013 by 52itstyle
* All rights reserved.
*/
/**
*@Function: html 转为纯文本保留格式
*@Class Name: WebFormatter
*@Author: zhangZhiPeng
*@Date: 2013-10-29
*@Modifications:
*@Modifier Name; Date; The Reason for Modifying
*
*/
public class WebFormatter{
public static void main(String[] args){
String content = html2text("<p>The Nobel(璇鸿礉灏? Peace Prize for 2008 was given to Martti Ahtisaari. He was the president of Finland from 1994 to 2000. He won the prize for his work in solving (瑙ｅ喅)international conflicts (鍐茬獊) for more than 30 years. </p><p> During all his life, both as president and as an international officer, he has worked for peace. For the past 20 years, he has done a lot to resolve several serious international conflicts. Some of these conflicts had lasted for years. In 1989-90, he played an important role in helping Namibia鈥檚 independence(鐙?珛); in 2005 he did his best to help solve the Aceh question in Indonesia. In 1999 and again in 2005-2007, under very difficult situation he found ways to help solve the conflict in Kosovo. In 2008, together with other organizations, he has tried to help solve many of the problems in Iraq. He has also made great contributions(璐＄尞) to solving the conflict in Northern Ireland, Central Asia, and on the Horn of Africa.</p><p> 鈥淭his work has made a more peaceful world in Nobel鈥檚 spirit,鈥?the officer said, 鈥渟o he has won the prize.鈥?</p>");
System.out.println(content);
String txtSrc = "D://tomcat6_beta//webapps//web01//file//paper//111.txt";
createTextFile(txtSrc,content);
}
public static void createTextFile(String src, String text) {
try {
FileWriter fw = new FileWriter(src);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(text);
bw.flush();
bw.close();
fw.close();
} catch (Exception e) {
e.printStackTrace();
}
}
public static String html2text(String html) {
StringBuffer sb = new StringBuffer(html.length());
char[] data = html.toCharArray();
int start = 0;
boolean previousIsPre = false;
Token token = null;
for(;;) {
token = parse(data, start, previousIsPre);
if(token==null)
break;
previousIsPre = token.isPreTag();
sb = sb.append(token.getText());
start += token.getLength();
}
return sb.toString();
}
private static Token parse(char[] data, int start, boolean previousIsPre) {
if(start>=data.length)
return null;
// try to read next char:
char c = data[start];
if(c=='<') {
// this is a tag or comment or script:
int end_index = indexOf(data, start+1, '>');
if(end_index==(-1)) {
// the left is all text!
return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
}
String s = new String(data, start, end_index-start+1);
// now we got s="<...>":
if(s.startsWith("<!--")) { // this is a comment!
int end_comment_index = indexOf(data, start+1, "-->");
if(end_comment_index==(-1)) {
// illegal end, but treat as comment:
return new Token(Token.TOKEN_COMMENT, data, start, data.length, previousIsPre);
}
else
return new Token(Token.TOKEN_COMMENT, data, start, end_comment_index+3, previousIsPre);
}
String s_lowerCase = s.toLowerCase();
if(s_lowerCase.startsWith("<script")) { // this is a script:
int end_script_index = indexOf(data, start+1, "</script>");
if(end_script_index==(-1))
// illegal end, but treat as script:
return new Token(Token.TOKEN_SCRIPT, data, start, data.length, previousIsPre);
else
return new Token(Token.TOKEN_SCRIPT, data, start, end_script_index+9, previousIsPre);
}
else { // this is a tag:
return new Token(Token.TOKEN_TAG, data, start, start+s.length(), previousIsPre);
}
}
// this is a text:
int next_tag_index = indexOf(data, start+1, '<');
if(next_tag_index==(-1))
return new Token(Token.TOKEN_TEXT, data, start, data.length, previousIsPre);
return new Token(Token.TOKEN_TEXT, data, start, next_tag_index, previousIsPre);
}
private static int indexOf(char[] data, int start, String s) {
char[] ss = s.toCharArray();
// TODO: performance can improve!
for(int i=start; i<(data.length-ss.length); i++) {
// compare from data[i] with ss[0]:
boolean match = true;
for(int j=0; j<ss.length; j++) {
if(data[i+j]!=ss[j]) {
match = false;
break;
}
}
if(match)
return i;
}
return (-1);
}
private static int indexOf(char[] data, int start, char c) {
for(int i=start; i<data.length; i++) {
if(data[i]==c)
return i;
}
return (-1);
}
}
@SuppressWarnings("unchecked")
class Token {
public static final int TOKEN_TEXT = 0; // html text.
public static final int TOKEN_COMMENT = 1; // comment like
public static final int TOKEN_TAG = 2; // tag like <pre>, <font>, etc.
public static final int TOKEN_SCRIPT = 3;
private static final char[] TAG_BR = "<br".toCharArray();
private static final char[] TAG_P = "<p".toCharArray();
private static final char[] TAG_LI = "<li".toCharArray();
private static final char[] TAG_PRE = "<pre".toCharArray();
private static final char[] TAG_HR = "<hr".toCharArray();
private static final char[] END_TAG_TD = "</td>".toCharArray();
private static final char[] END_TAG_TR = "</tr>".toCharArray();
private static final char[] END_TAG_LI = "</li>".toCharArray();
private static final Map SPECIAL_CHARS = new HashMap();
private int type;
private String html; // original html
private String text = null; // text!
private int length = 0; // html length
private boolean isPre = false; // isPre tag?
static {
SPECIAL_CHARS.put(""", "/");
SPECIAL_CHARS.put("<", "<");
SPECIAL_CHARS.put(">", ">");
SPECIAL_CHARS.put("&", "&");
SPECIAL_CHARS.put("®", "(r)");
SPECIAL_CHARS.put("©", "(c)");
SPECIAL_CHARS.put(" ", " ");
SPECIAL_CHARS.put("￡", "?");
}
public Token(int type, char[] data, int start, int end, boolean previousIsPre) {
this.type = type;
this.length = end - start;
this.html = new String(data, start, length);
//System.out.println("[Token] html=" + html + ".");
parseText(previousIsPre);
// System.out.println("[Token] text=" + text + ".");
}
public int getLength() {
return length;
}
public boolean isPreTag() {
return isPre;
}
private void parseText(boolean previousIsPre) {
if(type==TOKEN_TAG) {
char[] cs = html.toCharArray();
if(compareTag(TAG_BR, cs) || compareTag(TAG_P, cs))
text = "\r\n";
else if(compareTag(TAG_LI, cs))
text = "\n* ";
else if(compareTag(TAG_PRE, cs))
isPre = true;
else if(compareTag(TAG_HR, cs))
text = "\n--------\n";
else if(compareString(END_TAG_TD, cs))
text = "\t";
else if(compareString(END_TAG_TR, cs) || compareString(END_TAG_LI, cs))
text = "\n";
}
// text token:
else if(type==TOKEN_TEXT) {
text = toText(html, previousIsPre);
}
}
public String getText() {
return text==null ? "" : text;
}
private String toText(String html, final boolean isPre) {
char[] cs = html.toCharArray();
StringBuffer buffer = new StringBuffer(cs.length);
int start = 0;
boolean continueSpace = false;
char current, next;
for(;;) {
if(start>=cs.length)
break;
current = cs[start]; // read current char
if(start+1<cs.length) // and next char
next = cs[start+1];
else
next = '\0';
if(current==' ') {
if(isPre || !continueSpace)
buffer = buffer.append(' ');
continueSpace = true;
// continue loop:
start++;
continue;
}
// not ' ', so:
if(current=='\r' && next=='\n') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start+=2;
continue;
}
if(current=='\n' || current=='\r') {
if(isPre)
buffer = buffer.append('\n');
// continue loop:
start++;
continue;
}
// cannot continue space:
continueSpace = false;
if(current=='&') {
// maybe special char:
int length = readUtil(cs, start, ';', 10);
if(length==(-1)) { // just '&':
buffer = buffer.append('&');
// continue loop:
start++;
continue;
}
else { // check if special character:
String spec = new String(cs, start, length);
String specChar = (String)SPECIAL_CHARS.get(spec);
if(specChar!=null) { // special chars!
buffer = buffer.append(specChar);
// continue loop:
start+=length;
continue;
}
else { // check if like 'Ӓ':
if(next=='#') { // maybe a char
String num = new String(cs, start+2, length-3);
try {
int code = Integer.parseInt(num);
if(code>0 && code<65536) { // this is a special char:
buffer = buffer.append((char)code);
// continue loop:
start++;
continue;
}
}
catch(Exception e) {}
// just normal char:
buffer = buffer.append("&#");
// continue loop:
start+=2;
continue;
}
else { // just '&':
buffer = buffer.append('&');
// continue loop:
start++;
continue;
}
}
}
}
else { // just a normal char!
buffer = buffer.append(current);
// continue loop:
start++;
continue;
}
}
return buffer.toString();
}
// read from cs[start] util meet the specified char 'util',
// or null if not found:
private int readUtil(final char[] cs, final int start, final char util, final int maxLength) {
int end = start+maxLength;
if(end>cs.length)
end = cs.length;
for(int i=start; i<start+maxLength; i++) {
if(cs[i]==util) {
return i-start+1;
}
}
return (-1);
}
// compare standard tag "<input" with tag "<INPUT value=aa>"
private boolean compareTag(final char[] ori_tag, char[] tag) {
if(ori_tag.length>=tag.length)
return false;
for(int i=0; i<ori_tag.length; i++) {
if(Character.toLowerCase(tag[i])!=ori_tag[i])
return false;
}
// the following char should not be a-z:
if(tag.length>ori_tag.length) {
char c = Character.toLowerCase(tag[ori_tag.length]);
if(c<'a' || c>'z')
return true;
return false;
}
return true;
}
private boolean compareString(final char[] ori, char[] comp) {
if(ori.length>comp.length)
return false;
for(int i=0; i<ori.length; i++) {
if(Character.toLowerCase(comp[i])!=ori[i])
return false;
}
return true;
}
public String toString() {
return html;
}
}

复制代码

科帮网 1、本主题所有言论和图片纯属会员个人意见，与本社区立场无关
2、本站所有主题由该帖子作者发表，该帖子作者与科帮网享有帖子相关版权
3、其他单位或个人使用、转载或引用本文时必须同时征得该帖子作者和科帮网的同意
4、帖子作者须承担一切因本文发表而直接或间接导致的民事或刑事法律责任
5、本帖部分内容转载自其它媒体，但并不代表本站赞同其观点和对其真实性负责
6、如本帖侵犯到任何版权问题，请立即告知本站，本站将及时予与删除并致以最深的歉意
7、科帮网管理员和版主有权不事先通知发贴者而删除本文