Hyperestraier Redux - A User-friendly Approach

http://localhost/search

# Build the index for this tree. The "&&" keeps estcmd from indexing some
# other directory if the cd fails ("." would then mean the wrong place).
cd /home/joe/XYZ && estcmd gather -sd searchXYZ .

# Copy all ".txt" and ".html" files from my ~/www directory to /tmp/X.
# find's -iregex has to match the WHOLE path, so the pattern reads "anything,
# a literal dot, then html or txt" (case-insensitively); no "$" anchor is needed.
find ~/www -type f -size +0 -iregex '.*\.\(html\|txt\)' -exec cp {} /tmp/X \;
# How many files was that, anyway?
ls /tmp/X|wc -l
2924
# How big is all that stuff?
du -sh /tmp/X|cut -f1
342M

# Index the copied files. The "&&" stops estcmd from running against the
# wrong directory if /tmp/X cannot be entered.
cd /tmp/X && estcmd gather -sd searchX .

estcmd: INFO: reading list from the directory: .
estcmd: INFO: status: name=searchX dnum=0 wnum=0 fsiz=6899176 crnum=0 csiz=0 dknum=0
estcmd: INFO: 1 (/tmp/X/00ReadMe.txt): registered
estcmd: INFO: 2 (/tmp/X/00_READMEFIRST.txt): registered
estcmd: INFO: 3 (/tmp/X/A Time Comes In Your Life.txt): registered

[ ...skipping a few thousand lines ]

estcmd: INFO: 2922 (/tmp/X/yaw.html): registered
estcmd: INFO: 2923 (/tmp/X/youtube.html): registered
estcmd: INFO: 2924 (/tmp/X/zQdebit-orderform.html): registered
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=1 fsiz=41568116 crnum=157951 csiz=56815761 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=10001 fsiz=41935584 crnum=147951 csiz=55749775 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=20001 fsiz=45899931 crnum=137951 csiz=50521003 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=30001 fsiz=49897291 crnum=127951 csiz=45494307 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=40001 fsiz=52269735 crnum=117951 csiz=42341097 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=50001 fsiz=54037209 crnum=107951 csiz=39543361 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=60001 fsiz=55833455 crnum=97951 csiz=36869171 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=70001 fsiz=58203816 crnum=87951 csiz=33508862 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=80001 fsiz=61974918 crnum=77951 csiz=28867366 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=90001 fsiz=64163782 crnum=67951 csiz=25698000 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=100001 fsiz=66314530 crnum=57951 csiz=22858433 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=110001 fsiz=69521776 crnum=47951 csiz=18789339 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=120001 fsiz=71238559 crnum=37951 csiz=16151196 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=130001 fsiz=73565534 crnum=27951 csiz=12885585 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=140001 fsiz=75759457 crnum=17951 csiz=9718694 dknum=0
estcmd: INFO: flushing index words: name=searchX dnum=2924 wnum=150001 fsiz=80626768 crnum=7951 csiz=3832485 dknum=0
estcmd: INFO: closing: name=searchX dnum=2924 wnum=157952 fsiz=83841343 crnum=0 csiz=0 dknum=0
estcmd: INFO: finished successfully: elapsed time: 0h 2m 14s

# Run the three post-indexing passes on searchX, in order
for pass in "extkeys" "optimize" "purge -cl"; do
	# $pass is left unquoted on purpose so that "purge -cl" splits into two words
	estcmd $pass searchX
done

# Index the mail directory. Everything is grouped behind the cd so that a
# failed cd cannot leave us building "searchMail" out of some other directory.
cd ~/Mail && {
	estcmd gather -cm -sd -fm -lf 4 -bc searchMail .
	estcmd extkeys searchMail
	estcmd optimize searchMail
	estcmd purge -cl searchMail
}

# Scratch file that will hold the list of files to index; its name goes in "$tmp"
tmp=$(mktemp "/tmp/searchXXXXXX")
# The index is named "search" plus the last component of the current directory
index="search${PWD##*/}"
# List every non-empty regular file, pruning the index subdirectory itself
find . -path "./$index" -prune -o \( -type f -size +0 \) -fprint "$tmp"

# Feed the collected list to estcmd (files over 4MB are skipped), then run
# the usual three post-indexing passes
estcmd gather -sd -fm -lf 4 "$index" "$tmp"
for pass in "extkeys" "optimize" "purge -cl"; do
	# $pass is left unquoted on purpose so that "purge -cl" splits into two words
	estcmd $pass "$index"
done

# The scratch file is no longer needed
rm "$tmp"

# Index the MealMaster recipes as plain text. The commands are grouped behind
# the cd so that a failed cd cannot index some other directory by accident.
cd ~/Docs/MealMaster && {
	estcmd gather -cm -sd -ft searchMealMaster .
	estcmd extkeys searchMealMaster
	estcmd optimize searchMealMaster
	estcmd purge -cl searchMealMaster
}

#!/bin/bash
# Created by Ben Okopnik on Sat Jan  3 00:50:54 EST 2009
# Some ideas from Karl Vogel's Hyperestraier article
# (http://linuxgazette.net/158/vogel.html)
#
# Builds or refreshes the Hyperestraier index "search<dirname>" in the current
# directory from every non-empty file that 'file' reports as text-like, then
# runs the extkeys/optimize/purge passes on it.

# Maximum file size in MB; adjust this to your preferences 
MAX_FILE_SIZE=3

dir="$(pwd)"
db="$dir/search${dir##*/}"
# 'find .' prints paths relative to '.', so the index directory has to be
# pruned by its relative name; the absolute "$db" never matches find's output.
db_rel="./${db##*/}"

# Default options for "gather":
#	-cl: Regions of overwritten documents will be cleaned up
#	-ft: Files will be treated as plain text
#	-bc: Binary files will be detected and ignored
#	-sd: Modification date of each file will be recorded as an attribute
#	-cm: Documents whose modification date has not changed will be ignored
#	-lf N: Ignore any documents larger than N megabytes
# Kept in an array so each option reaches estcmd as its own word without
# relying on unquoted word splitting.
gather_opts=(-cl -ft -bc -sd -cm -lf "$MAX_FILE_SIZE")

# Define file extensions to ignore; this saves us time, since we don't need
# to run "file" over them. This list does not include "questionable"
# filetypes (i.e., DOC, PDF, etc.) that you may want to delegate and index later.
# (The index directory itself is excluded by the -prune below, not here.)
ignore='\.(gif|jpg|png|xcf|gz|tgz|tbz|p[pb]m|tiff?|mp[23g]|mpeg|wav|midi?|sid|au|r[am]|[au]law|xbm|pag|dir|swp|idx|psd|xls|sxw|zip|pgm|wm[av]|eps|swf|aux|bbl|tex|raw|od[st])$'

/bin/echo "========= Searching for indexable content ============="

# If there's no EXCLUDE file, create one that just excludes itself
# (needed by the 'grep -E -ivf EXCLUDE' filter.)
[ ! -f "$dir/EXCLUDE" ] && echo '/EXCLUDE$' > "$dir/EXCLUDE"

# Ignore the Hyperestraier index and any empty or "non-regular" files.
# The explicit -print keeps the pruned directory itself out of the output.
/usr/bin/find . -path "$db_rel" -prune -o -type f -size +0 -print|\
	# Generate 'file' output for each file, ignoring weirdness in filenames
	/usr/bin/xargs -d '\n' -I '{}' -s 1000 file -F '///' '{}'|\
	# Ignore these (false positives for "text" filetype)
	/bin/grep -E -iv '///.*(latex|rich)'|\
	# Ignore everything _except_ these filetypes (positive matches); return fileNAMES
	/bin/sed -n 's#^\(.*\)///.*\(text\|xml\|pod_doc\).*$#\1#;T;p'|\
	# Exclude any filenames that match patterns in the 'EXCLUDE' file
	/bin/grep -E -ivf "$dir/EXCLUDE"|\
	# Exclude filenames that match the 'ignore' pattern
	/bin/grep -E -iv "$ignore"|\
	# Index the remaining results
	/usr/bin/estcmd gather "${gather_opts[@]}" "$db" -

# Remove the 'spurious' EXCLUDE file (only if it is still the one-line
# placeholder created above); the path is quoted so directories containing
# spaces work.
[ "$(/bin/cat "$dir/EXCLUDE")" = '/EXCLUDE$' ] && rm "$dir/EXCLUDE"

/bin/echo "================== Optimizing... ======================"
/usr/bin/estcmd extkeys "$db"
/bin/sleep 1
/usr/bin/estcmd optimize "$db"
/usr/bin/estcmd purge -cl "$db"
/bin/echo "==================== Finished. ========================"

#!/bin/bash
# Created by Ben Okopnik on Thu Jan 15 23:41:56 CST 2009
#
# Sets up a web search directory under $WEBROOT/search for the index
# "search<dirname>" found in the current directory: it writes an estseek.conf
# pointing at that index and symlinks the shared estseek files next to it.

WEBROOT="/var/www"

dir="$(pwd)"
db="$dir/search${dir##*/}"

# Exit if there's no index database in the current directory.
# The path is passed as an argument, not as the printf format, so a "%" in a
# directory name cannot corrupt the message.
[ -d "$db" ] ||  { printf '%s not found - exiting...\n' "$db"; exit 1; }

sdir="$WEBROOT/search/${dir##*/}"
# Exit if the search directory with the proposed name already exists
[ -d "$sdir" ] && { printf '%s already exists - exiting...\n' "$sdir"; exit 1; }

# Create the ".source" dir if it doesn't already exist and copy the key
# files into it
[ -d "$WEBROOT/search/.source" ] || {
	mkdir -p "$WEBROOT/search/.source" || exit 1
	cp /usr/share/hyperestraier/estseek* "$WEBROOT/search/.source"
	cp /usr/lib/estraier/estseek.cgi "$WEBROOT/search/.source"
}

# Stop if the search directory cannot be created or entered; otherwise the
# config file and the symlinks below would land in the wrong directory.
mkdir -p "$sdir" || exit 1
cd "$sdir" || exit 1
# Copy the template config, replacing its "indexname:" line with the path of
# our index (handed to perl through the environment to avoid quoting issues)
DB="$db" /usr/bin/perl -wpe's/^(indexname:).*$/$1 $ENV{DB}/' ../.source/estseek.conf > estseek.conf
ln -s ../.source/estseek.{cgi,help,tmpl,top} . 

#!/usr/bin/perl -wT
# Created by Ben Okopnik on Thu Jan 15 22:11:38 CST 2009
#
# Prints an HTML page that links every subdirectory of the current working
# directory to "<subdirectory>/estseek.cgi".
use strict;
use CGI qw/:standard/;
use Encode qw/decode/;
$|++;

# glob returns raw bytes. Decode them here so that the UTF-8 output layer set
# up below encodes each name exactly once; without this, non-ASCII directory
# names would be encoded twice and show up garbled.
my @dirs = map { decode( 'UTF-8', $_ ) } grep { -d } glob '*';

binmode STDOUT, ':encoding(UTF-8)';		# Set up utf-8 output
print header( -charset => 'utf-8' ), 
	start_html( -encoding => 'utf-8', -title => 'Available searches' ),
	h3('Available search indexes'),
	# One link per directory; the visible text is HTML-escaped so a name
	# containing "<" or "&" cannot break the page markup.
	map( { ( a( { -href => "$_/estseek.cgi" }, escapeHTML($_) ), br(), "\n" ) } @dirs ),
	end_html;

// Browser preference lines defining a policy named "localfilelinks" for the
// sites http://localhost and http://127.0.0.1.
// NOTE(review): judging by the pref names, this lets pages from those sites
// open local file links; confirm against the browser's documentation.
user_pref("capability.policy.localfilelinks.checkloaduri.enabled", "allAccess");
user_pref("capability.policy.localfilelinks.sites", "http://localhost http://127.0.0.1");
user_pref("capability.policy.policynames", "localfilelinks");

text/plain					com

Home Main Site FAQ Site Map Mirrors Translations Search Archives Authors Mailing Lists Join Us! Contact Us
The Free International Online Linux Monthly	ISSN: 1934-371X	Main site: http://linuxgazette.net

-cm	index only the files where the mtime has changed
-sd	record the mtime of each file as an "attribute" (searchable data)
-fm	treat all files as MIME (note that without this option, most of your email files would be ignored - e.g., files ending in ".com" would be rejected as DOS executables!)
-lf 4	don't index files over 4MB in size
-bc	ignore binary files (there shouldn't be any, but might as well check)
.	read the files from the current directory

Hyperestraier Redux - A User-friendly Approach

Personal Quirks Warning

The Basics

Decisions, Decisions

The Mission Moves into its Execute Phase

Re-indexing

So, the result is...

Indexing Other Stuff

On To Other And Bigger Challenges

The Big Challenge

Browsing the Results

"Awesome! I'm going to use my Mozilla browser and pretend that I own Google!!!"

Wrap-up

Should I save this or open it with some random application? Maybe I should just execute it and try to infect your machine... oh, darn, this is Linux. Can I just explode your monitor, then? Are you sure? Just a little bit?
	Destroy		Smash