I wrote the following two functions for one of the CrabFire filters. They require no memory lookups so they will not pollute the data cache or hog the memory bus.
Code:
/*
 * Convert the ASCII upper-case letters in a 16-byte vector to lower case.
 * Bytes outside 'A'..'Z' are returned unchanged.
 *
 * All constants are synthesized in registers from splat immediates
 * (recipes from Holger Bettag's table of constants), so no loads are issued.
 */
vector char vec_tolower(vector char str)
{
	/* 0x04 rotated left by 4 -> 0x40, the byte just below 'A'. */
	const vector char below_A  = vec_rl(vec_splat_u8(4), vec_splat_u8(4));
	/* 0x0b rotated left by 3 (0x0b mod 8) -> 0x58; OR 0x0b -> 0x5b, just above 'Z'. */
	const vector char above_Z  = vec_vor(vec_rl(vec_splat_u8(0xb), vec_splat_u8(0xb)), vec_splat_u8(0xb));
	/* 0x01 rotated left by 5 -> 0x20, the distance between the two cases. */
	const vector char case_gap = vec_rl(vec_splat_u8(1), vec_splat_u8(5));

	/* A lane is selected when it lies strictly between the two bounds. */
	const vector bool char is_upper = vec_and(vec_cmpgt(str, below_A),
	                                          vec_cmplt(str, above_Z));

	/* Take str + 0x20 in the selected lanes and the original byte elsewhere. */
	return vec_sel(str, vec_add(str, case_gap), is_upper);
}
Code:
/*
 * Convert the ASCII lower-case letters in a 16-byte vector to upper case.
 * Bytes outside 'a'..'z' are returned unchanged.
 *
 * All constants are synthesized in registers from splat immediates
 * (recipes in the style of Holger Bettag's table of constants), so no loads
 * are issued.
 */
vector char vec_toupper(vector char str)
{
	/* 0x03 rotated left by 5 -> 0x60, the byte just below 'a'. */
	vector char a         = vec_rl(vec_splat_u8(3), vec_splat_u8(5));
	/*
	 * Exclusive upper bound 0x7b, the byte just above 'z':
	 * rounded unsigned average of 0x00 and 0xf5 = (0 + 245 + 1) >> 1 = 123.
	 * The earlier recipe averaged 0x00 with 0xf3 (-13), which yields
	 * 0x7a == 'z'; with the strict less-than below, 'z' was never converted.
	 */
	vector char z         = vec_avg(vec_splat_u8(0), vec_splat_u8(-11));
	/* 0x01 rotated left by 5 -> 0x20, the distance between the two cases. */
	vector char diff      = vec_rl(vec_splat_u8(1), vec_splat_u8(5));
	/* Both comparisons are strict, so the bounds are exclusive. */
	vector bool char gt   = vec_cmpgt(str, a);
	vector bool char lt   = vec_cmplt(str, z);
	vector bool char mask = vec_and(gt, lt);
	/* Upper-cased candidate for every lane; only masked lanes are used. */
	vector char upper     = vec_sub(str, diff);
	return vec_sel(str, upper, mask);
}