Found it 🙂
Assuming $contents holds your entire HTML file:
#-------------------------------------
# Strip HTML tags
#-------------------------------------
# _strip_html_tags($html)
#   Returns the visible text of $html, one non-empty line per output line,
#   each terminated by "\n" and with leading/trailing whitespace trimmed.
#   Comments and <style>, <script>, <title>, <noframes> elements are removed
#   together with their content; every other tag is removed but its content
#   is kept. Character references (&amp;, &#160;, &#xA0; ...) are deleted,
#   not decoded. Returns "" for undef or markup-only input.
sub _strip_html_tags {
    my ($html) = @_;
    return '' unless defined $html;

    # Normalise CRLF / lone CR so the line split below sees every line break.
    $html =~ s/\r\n?/\n/g;

    # The /s modifier lets "." match newlines, so elements spanning several
    # lines are handled without swapping newlines for a placeholder (a
    # placeholder would corrupt text that already contains it).

    # Comments go first: they may contain ">" which would confuse the
    # generic tag pattern further down.
    $html =~ s{<!--.*?-->}{}gs;

    # Elements whose content is not page text.
    for my $tag (qw(style script title noframes)) {
        $html =~ s{<$tag\b.*?</$tag\s*>}{}gis;
    }

    # Well-formed character references only; a bare "&" in running text
    # must not swallow everything up to the next ";".
    $html =~ s/&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);//gi;

    # Any remaining tag ([^>] also matches newlines).
    $html =~ s/<[^>]*>//g;

    # Re-assemble: trim each line and drop the ones that end up empty.
    my $text = '';
    for my $line (split /\n/, $html) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        $text .= "$line\n" if $line ne '';
    }
    return $text;
}

#-------------------------------------
# Re-assemble content of the HTML file
#-------------------------------------
$contents = _strip_html_tags($contents);
# Running total of the stripped size (accumulates across repeated runs).
$SizeAfter += length($contents);
#-------------------------------------
if ($contents ne "") {
    # your stuff here
}