Java regex filter for HTML img tag 

Joined:
04/09/2007
Posts:
753

November 11, 2011 10:05:22    Last update: November 11, 2011 10:12:01
This is an HTML image tag filter using Java regex. It takes a string, finds the img tags, replaces the value of each src attribute with one provided by the filter, and appends a class name to the class attribute (adding a class attribute if the tag does not have one).
import java.util.regex.*;
import java.io.*;

/**
 * Rewrites HTML {@code <img>} tags found in a string.
 *
 * <p>For every {@code <img>} tag that carries a {@code src} attribute the filter
 * replaces the attribute value with {@code MY_IMAGE_SRC_REPLACEMENT} and appends
 * {@code MY_CLASS} to the tag's {@code class} attribute, creating the attribute
 * when the tag has none. All other attributes, the whitespace between them and
 * the tag terminator ({@code >} or {@code />}) are copied through unchanged.
 * Tags without a {@code src} attribute are left exactly as they were.
 *
 * <p>This is a regex-based filter, not an HTML parser: it works on one tag at a
 * time and expects attribute values to use balanced quotes.
 */
public class ImgTagFilter {
    /** Value written into every rewritten {@code src} attribute. */
    private static final String SRC_REPLACEMENT = "MY_IMAGE_SRC_REPLACEMENT";

    /** Class name appended to (or created as) the {@code class} attribute. */
    private static final String EXTRA_CLASS = "MY_CLASS";

    /**
     * Matches one complete img tag. Group 1 is the raw attribute text (starting
     * with the whitespace after {@code <img}); group 2 is the terminator together
     * with any whitespace in front of it. Quoted strings are consumed as a unit so
     * that a {@code >} inside an attribute value does not end the tag, and the
     * match can never run past the tag's own closing {@code >}.
     */
    private static final Pattern IMG_PATTERN = Pattern.compile(
            "<img(\\s(?:[^>\"']|\"[^\"]*\"|'[^']*')*?)(\\s*/?>)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches a single attribute inside a tag. Group 1 is the attribute name;
     * the value, when present, is in group 2 (double-quoted), group 3
     * (single-quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTR_PATTERN = Pattern.compile(
            "([^\\s\"'=<>/]+)(?:\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+)))?");

    /**
     * Reads {@code img-parse-test.html} from the working directory, rewrites its
     * img tags and prints the result to standard output.
     *
     * @param args ignored
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        ImgTagFilter filter = new ImgTagFilter();
        String str = filter.getFileContents("img-parse-test.html");
        System.out.println(filter.rewriteImgTag(str));
    }

    /**
     * Returns a copy of {@code str} in which every img tag that has a
     * {@code src} attribute has been rewritten.
     *
     * @param str the HTML text to process; must not be {@code null}
     * @return the text with rewritten img tags; everything outside those tags is
     *         copied verbatim
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public String rewriteImgTag(String str) {
        StringBuilder sb = new StringBuilder(str.length() + 64);
        Matcher tag = IMG_PATTERN.matcher(str);
        int copiedUpTo = 0;
        while (tag.find()) {
            String attributes = rewriteAttributes(tag.group(1));
            if (attributes == null) {
                // No src attribute: leave the tag alone. It is copied through
                // together with the text that follows it.
                continue;
            }
            sb.append(str, copiedUpTo, tag.start());
            // Re-emit the terminator captured by the pattern so the tag stays closed.
            sb.append("<img").append(attributes).append(tag.group(2));
            copiedUpTo = tag.end();
        }
        sb.append(str, copiedUpTo, str.length());
        return sb.toString();
    }

    /**
     * Rewrites the attribute text of one img tag: replaces each {@code src}
     * attribute, extends each {@code class} attribute and adds a {@code class}
     * attribute at the end when none was present. Attribute names are compared
     * as whole names, so {@code data-src} or {@code data-class} are not touched.
     *
     * @param attrs the raw text between {@code <img} and the tag terminator
     * @return the rewritten attribute text, or {@code null} if the tag has no
     *         {@code src} attribute
     */
    private static String rewriteAttributes(String attrs) {
        StringBuilder out = new StringBuilder(attrs.length() + 48);
        Matcher attr = ATTR_PATTERN.matcher(attrs);
        boolean srcSeen = false;
        boolean classSeen = false;
        int copiedUpTo = 0;
        while (attr.find()) {
            String name = attr.group(1);
            String replacement;
            if (name.equalsIgnoreCase("src")) {
                srcSeen = true;
                replacement = "src=\"" + SRC_REPLACEMENT + "\"";
            } else if (name.equalsIgnoreCase("class")) {
                classSeen = true;
                String existing = attributeValue(attr);
                replacement = "class=\""
                        + (existing.isEmpty() ? EXTRA_CLASS : existing + " " + EXTRA_CLASS)
                        + "\"";
            } else {
                continue; // unrelated attribute: copied verbatim with the gap text
            }
            // Copy whatever sits between the previous rewritten attribute and this
            // one (whitespace, other attributes) exactly as written.
            out.append(attrs, copiedUpTo, attr.start()).append(replacement);
            copiedUpTo = attr.end();
        }
        if (!srcSeen) {
            return null;
        }
        out.append(attrs, copiedUpTo, attrs.length());
        if (!classSeen) {
            out.append(" class=\"").append(EXTRA_CLASS).append('"');
        }
        return out.toString();
    }

    /**
     * Returns the value of the attribute the matcher is positioned on, whichever
     * quoting style was used, or the empty string for a value-less attribute.
     */
    private static String attributeValue(Matcher attr) {
        for (int group = 2; group <= 4; group++) {
            String value = attr.group(group);
            if (value != null) {
                return value;
            }
        }
        return "";
    }

    /**
     * Reads a whole file into a string, decoding its bytes as UTF-8.
     *
     * <p>The file is read until end-of-stream rather than relying on the size
     * reported by the file system, and the stream is closed even when reading
     * fails.
     *
     * @param fileName path of the file to read
     * @return the decoded file contents
     * @throws Exception if the file cannot be opened or read
     */
    String getFileContents(String fileName) throws Exception {
        ByteArrayOutputStream contents = new ByteArrayOutputStream();
        InputStream in = new FileInputStream(fileName);
        try {
            byte[] chunk = new byte[8192];
            int n;
            while ((n = in.read(chunk)) != -1) {
                contents.write(chunk, 0, n);
            }
        } finally {
            in.close();
        }
        // Explicit charset: the result must not depend on the platform default.
        return contents.toString("UTF-8");
    }
}

Test file:
<div id="HTML snippet">
<img src="img/big/txt-marquee-shop-online.png" class="slider-1-2-3" alt="Simplify your online experience with 3-click checkout!" />
<p>somthignkg</p>
<ul>
<li>List 1</li>
<li>List 2</li>
<li>List 3</li>
</ul>
<img class="my class" 
src="images/theImage.gif" alt="Title"/>
<img 
style="border: #ddd 1px solid; color: #aaa;"
src="http://www.ags.com/img/myImage.png"
alt="another title"/>
trailer
</div>
Share |
| Comment  | Tags