Today at work I had to take a whole set of files, directory upon directory of them, and convert them from one character encoding to UTF-8. Of course, this was instantly a task I said a script should do, and along came the following script. It takes two options. The first, which is required, defines the character encoding of the files being converted, using the flag “-f”. The second, which is optional, defines the character encoding the files are to be converted to, using the flag “-t”; it is optional because the default is to convert to UTF-8. Finally, it accepts an argument list of directory or file names and processes each one, recursively going through each directory and processing every file and/or directory in it. It ignores all invisible files (those that start with “.”). It will most likely fail for a given file if the specified character encoding is off, but it will continue with the other files, notifying the user that there was an issue.
require 'optparse'
require 'ostruct'
require 'iconv'
# Human-readable program name; shown by the -V/--version switch.
SCRIPT_NAME = 'File Character Encoding Converter'
# Release number appended to the program name in the version banner.
SCRIPT_VERSION = '0.1'
# Parses the command-line switches understood by the converter script.
class FileEncodingConverterOptionParser
  # Parses +args+ destructively: every recognised switch is removed, so only
  # the file and directory operands remain in +args+ afterwards.
  #
  # -h/--help and -V/--version print their text and terminate the process.
  #
  # @param args [Array<String>] raw command-line arguments (typically ARGV)
  # @return [OpenStruct] with +from+ (source encoding), +to+ (target encoding,
  #   'UTF-8' unless -t is given) and +verbose+ (false unless -v is given)
  # @raise [OptionParser::MissingArgument] when -f/--from is not supplied
  # @raise [OptionParser::ParseError] for unknown or malformed switches
  def self.parse(args)
    options = OpenStruct.new
    options.to = 'UTF-8'
    options.verbose = false

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: convert_encoding.rb [options] directory|file..."

      opts.separator ""
      opts.separator "Specific options:"
      opts.on("-f", "--from ENCODING", "Character Encoding converting from (required)") do |encoding|
        options.from = encoding
      end
      opts.on("-t", "--to ENCODING", "Character Encoding converting to") do |encoding|
        options.to = encoding
      end

      opts.separator ""
      opts.separator "Common options:"
      opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
        options.verbose = v
      end
      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
      opts.on_tail("-V", "--version", "Show version") do
        puts "#{SCRIPT_NAME} #{SCRIPT_VERSION}"
        exit
      end
    end

    parser.parse!(args)

    # The source encoding has no sensible default, so reject the command line
    # here instead of handing a nil encoding to the converter later on.
    raise OptionParser::MissingArgument, '-f/--from ENCODING' unless options.from

    options
  end
end
# Parse the command line up front. A malformed command line (unknown switch,
# switch given without its value, ...) is reported as a one-line message on
# standard error with a non-zero exit status rather than an uncaught-exception
# backtrace.
begin
  options = FileEncodingConverterOptionParser.parse(ARGV)
rescue OptionParser::ParseError => e
  abort(e.message)
end
# Script-wide verbosity flag, coerced to a strict true/false.
VERBOSE = options.verbose ? true : false
# Converts files in place from one character encoding to another via Iconv.
class FileEncodingConverter
  # @param from [String] name of the encoding the files are currently in
  # @param to [String] name of the encoding to convert the files to
  def initialize(from, to = 'UTF-8')
    @to = to
    @from = from
  end

  # Re-encodes a single file in place.
  #
  # The whole file is read and converted in memory before it is reopened for
  # writing, so a file that fails to convert is never truncated. Conversion
  # and file-system failures are reported on standard output and swallowed so
  # that a batch run carries on with the remaining files. A success line is
  # printed only when the top-level VERBOSE constant is true.
  #
  # @param filename [String] path of the file to convert
  # @return [void]
  def fconv(filename)
    # Binary mode keeps the raw bytes intact; the block form closes each
    # handle even when reading, converting or writing raises.
    contents = File.open(filename, 'rb', &:read)
    output = Iconv.conv(@to, @from, contents)
    File.open(filename, 'wb') { |io| io.write(output) }
  rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, SystemCallError
    puts "Could not be processed: #{filename}"
  else
    puts "Processed successfully: #{filename}" if VERBOSE
  end

  # Recursively converts every visible regular file below +dir+.
  #
  # Entries whose name starts with "." are skipped. That covers hidden files
  # as well as the "." and ".." entries, which is what keeps the recursion
  # from revisiting the directory itself or climbing back up the tree.
  #
  # @param dir [String] path of the directory to walk
  # @return [void]
  def process_dir(dir)
    Dir.foreach(dir) do |entry|
      # start_with? checks the beginning of the name only, unlike a
      # line-anchored /^\./ which also matches after an embedded newline.
      next if entry.start_with?('.')

      path = File.join(dir, entry)
      if File.file?(path)
        fconv(path)
      elsif File.directory?(path)
        process_dir(path)
      end
    end
  end
end
# Convert every path named on the command line. Only the operands are left in
# ARGV at this point, because option parsing removed the switches it consumed.
converter = FileEncodingConverter.new(options.from, options.to)
ARGV.each do |arg|
  # Use the argument exactly as given: relative names already resolve against
  # the current directory, and absolute names must not get it prepended.
  entryname = arg
  if File.directory?(entryname)
    converter.process_dir(entryname)
  elsif File.file?(entryname)
    converter.fconv(entryname)
  else
    # Tell the user instead of silently ignoring a mistyped path.
    warn "Skipping (not a file or directory): #{arg}"
  end
end
Tags: ruby, iconv, utf8, encodings, scripting