# Build word-id feature files for every document found below the current
# directory.
#
# Pipeline (one rule per step, see below):
#   <doc>            -> <doc>.list      sorted, de-duplicated output of
#                                       text_to_word_list.pl for one document
#   all *.list       -> word_counts.txt "count word" lines, highest count first
#   word_counts.txt  -> words.txt       "word_id word" lines, sorted by word
#   word_counts.txt  -> dictionary.txt  words only, highest count first
#   <doc>.list       -> <doc>.features  ids of the words listed for the document
#
# NOTE: sort, uniq and join must all run under the same locale, because
# `sort --merge` and `join` rely on the ordering produced by earlier sorts.

# Every recipe writes its target through a shell redirection, so a failing
# recipe would otherwise leave a truncated file that looks up to date.
.DELETE_ON_ERROR:

# All second-level directories. To test the Makefile on a small portion of
# the data, override this on the command line, e.g.
#   make DIRECTORIES=20news-bydate-test/talk.politics.mideast/
DIRECTORIES := $(wildcard */*/)

# Documents: files directly inside DIRECTORIES whose names are 4, 5 or 6
# characters long. The generated <doc>.list / <doc>.features names are at
# least 9 characters long, so they never match these patterns.
SOURCES := $(wildcard $(addsuffix ????,$(DIRECTORIES))) \
           $(wildcard $(addsuffix ?????,$(DIRECTORIES))) \
           $(wildcard $(addsuffix ??????,$(DIRECTORIES)))

# The corresponding .list files
LISTS := $(addsuffix .list,$(SOURCES))

# The corresponding .features files
FEATURES := $(addsuffix .features,$(SOURCES))

# Default goal: the dictionary, the word-id table and all feature files.
all: dictionary.txt words.txt $(FEATURES)

# Print the file sets computed above (debugging aid).
show:
	@echo 'Directories:'
	@echo $(DIRECTORIES)
	@echo 'Sources:'
	@echo $(SOURCES)
	@echo 'Lists:'
	@echo $(LISTS)

# Remove every file this Makefile generates.
clean:
	$(RM) $(LISTS) $(FEATURES) words.txt word_counts.txt dictionary.txt

# Word list of one document: the output of text_to_word_list.pl, sorted with
# duplicates removed. The script is a prerequisite, so editing it rebuilds
# every list.
%.list: % text_to_word_list.pl
	./text_to_word_list.pl < $< | sort --unique > $@

# Translate word lists into word id lists: match field 1 of the list (the
# word) against field 2 of words.txt and print field 1 of words.txt (the id).
%.features: %.list words.txt
	join -1 1 -2 2 -o 2.1 $< words.txt > $@

# Create alphabetically sorted dictionary: word_id word.
# The id is the line number of the word in word_counts.txt, so id 1 belongs
# to the first (highest-count) line.
# NOTE(review): `cut --bytes=9-` assumes the count prefix written by
# `uniq --count` is exactly 8 bytes wide -- confirm for very large counts.
words.txt: word_counts.txt
	cut --bytes=9- $< | nl | sort -k 2,2 > $@

# List of: count word, in decreasing order of count. Each .list holds a word
# at most once, so the count is the number of lists containing the word.
# The inputs are already sorted, hence --merge. stdin is redirected so that
# sort does not block waiting for input when there are no lists at all.
word_counts.txt: $(LISTS)
	sort --merge $^ < /dev/null | uniq --count | sort --numeric-sort -r > $@

# Words only, in the same order as word_counts.txt (count column stripped).
dictionary.txt: word_counts.txt
	cut --bytes=9- $< > $@

.PHONY: all show clean