CleanWordKeepsStructure

This website contains links to software which is either no longer maintained or will be supported only until the end of 2019 (CKFinder 2). For the latest documentation about current CKSource projects, including software like CKEditor 4/CKEditor 5, CKFinder 3, Cloud Services, Letters, Accessibility Checker, please visit the new documentation website.

If you are looking for information about very old versions of CKEditor, FCKeditor and CKFinder, also check the CKEditor forum, which was closed in 2015. Otherwise, please head to StackOverflow for support.

CleanWordKeepsStructure

This setting controls the behavior of the Paste from Word dialog. Its default value is false; in that mode the clean-up routine modifies the pasted content so that it looks the way it did in Word. If it is switched to true, the routine prefers to keep the HTML structure of the data instead of modifying it to preserve the look it had in Word. Example:

FCKConfig.CleanWordKeepsStructure = true ;

Enabling this setting keeps a properly structured document as it was created in Word, and also keeps anchors; you can then use CSS to make it look the way it did in Word.

Additionally, you can specify your own clean-up function by adding it to the FCK object: define an FCK.CustomCleanWord function that accepts three parameters and returns the cleaned-up string. Here is an example based on the default function that is called in the Paste dialog (you should add your own function with a plugin, for example):

// This function will be called from the PasteFromWord dialog (fck_paste.html).
// It strips Word-specific markup from pasted HTML using a fixed sequence of
// regular-expression replacements; the order of the replacements matters,
// because later patterns rely on what earlier ones have already removed.
//
// Input:
//   oNode          an object exposing the raw paste as its "innerHTML" string
//                  (in the dialog, a DOM node holding the clipboard contents)
//   bIgnoreFont    when true, FONT "face" attributes and FONT-FAMILY style
//                  declarations are removed
//   bRemoveStyles  when true, complete style="..." attributes are removed
// Reads the global FCKConfig.CleanWordKeepsStructure to decide how headings
// and paragraphs are handled at the end.
// Output: the cleaned string
function CleanWord( oNode, bIgnoreFont, bRemoveStyles )
{
	var html = oNode.innerHTML ;

	// Empty <o:p> placeholders are dropped; non-empty ones become a
	// non-breaking space (innerHTML serializes it as the "&nbsp;" entity).
	html = html.replace( /<o:p>\s*<\/o:p>/g, '' ) ;
	html = html.replace( /<o:p>.*?<\/o:p>/g, '&nbsp;' ) ;

	// Remove mso-xxx styles.
	html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, '' ) ;

	// Remove margin styles.
	html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, '' ) ;
	html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;

	html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, '' ) ;
	html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;

	// The following declarations are only removed when they are the last
	// one inside the attribute value (the pattern ends at the closing quote).
	html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;

	html = html.replace( /\s*tab-stops:[^;"]*;?/gi, '' ) ;
	html = html.replace( /\s*tab-stops:[^"]*/gi, '' ) ;

	// Remove FONT face attributes.
	if ( bIgnoreFont )
	{
		html = html.replace( /\s*face="[^"]*"/gi, '' ) ;
		html = html.replace( /\s*face=[^ >]*/gi, '' ) ;

		html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, '' ) ;
	}

	// Remove Class attributes
	html = html.replace( /<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

	// Remove styles.
	if ( bRemoveStyles )
		html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;

	// Remove empty styles (possibly left behind by the removals above).
	html = html.replace( /\s*style="\s*"/gi, '' ) ;

	// A SPAN holding only a non-breaking space collapses to that space.
	html = html.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;

	html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, '' ) ;

	// Remove Lang attributes
	html = html.replace( /<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

	// Unwrap SPAN and FONT elements that no longer carry any attribute.
	html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, '$1' ) ;

	html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, '$1' ) ;

	// Remove XML elements and declarations
	html = html.replace( /<\\?\?xml[^>]*>/gi, '' ) ;

	// Remove Tags with XML namespace declarations: <o:p><\/o:p>
	html = html.replace( /<\/?\w+:[^>]*>/gi, '' ) ;

	// Remove comments [SF BUG-1481861].
	html = html.replace( /<\!--.*?-->/g, '' ) ;

	html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;

	html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;

	// Remove "display:none" tags.
	html = html.replace( /<(\w+)[^>]*\sstyle="[^"]*DISPLAY\s?:\s?none(.*?)<\/\1>/ig, '' ) ;

	// Remove language tags
	html = html.replace( /<(\w[^>]*) language=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

	// Remove onmouseover and onmouseout events (from MS Word comments effect)
	html = html.replace( /<(\w[^>]*) onmouseover="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
	html = html.replace( /<(\w[^>]*) onmouseout="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;

	if ( FCKConfig.CleanWordKeepsStructure )
	{
		// The original <Hn> tag send from Word is something like this: <Hn style="margin-top:0px;margin-bottom:0px">
		html = html.replace( /<H(\d)([^>]*)>/gi, '<h$1>' ) ;

		// Word likes to insert extra <font> tags, when using MSIE. (Weird).
		// Both the opening and the closing wrapper must be matched so that
		// the heading content is unwrapped without leaving a dangling tag.
		html = html.replace( /<(H\d)><FONT[^>]*>(.*?)<\/FONT><\/\1>/gi, '<$1>$2<\/$1>' ) ;
		html = html.replace( /<(H\d)><EM>(.*?)<\/EM><\/\1>/gi, '<$1>$2<\/$1>' ) ;
	}
	else
	{
		// Headings become bold DIVs with a font size that mimics the heading
		// level. The opening tags must mirror the closing sequence emitted
		// below (</font></b></div>) to keep the markup balanced.
		html = html.replace( /<H1([^>]*)>/gi, '<div$1><b><font size="6">' ) ;
		html = html.replace( /<H2([^>]*)>/gi, '<div$1><b><font size="5">' ) ;
		html = html.replace( /<H3([^>]*)>/gi, '<div$1><b><font size="4">' ) ;
		html = html.replace( /<H4([^>]*)>/gi, '<div$1><b><font size="3">' ) ;
		html = html.replace( /<H5([^>]*)>/gi, '<div$1><b><font size="2">' ) ;
		html = html.replace( /<H6([^>]*)>/gi, '<div$1><b><font size="1">' ) ;

		html = html.replace( /<\/H\d>/gi, '<\/font><\/b><\/div>' ) ;

		// Transform <P> to <DIV>
		var re = new RegExp( '(<P)([^>]*>.*?)(<\/P>)', 'gi' ) ;	// Different because of a IE 5.0 error
		html = html.replace( re, '<div$2<\/div>' ) ;

		// Remove empty tags (three times, just to be sure): removing an
		// inner empty tag can leave its parent empty, so several passes are
		// needed for nested cases.
		// This also removes any empty anchor
		for ( var i = 0 ; i < 3 ; i++ )
			html = html.replace( /<([^\s>]+)(\s[^>]*)?>\s*<\/\1>/g, '' ) ;
	}

	return html ;
}

This page was last edited on 9 January 2008, at 11:19.