#!/usr/bin/ruby

#
# Copyright (C) 2010 Sebastiano Vigna 
#
#  This script is free software; you can redistribute it and/or modify it
#  under the terms of the GNU Lesser General Public License as published by the Free
#  Software Foundation; either version 2.1 of the License, or (at your option)
#  any later version.
#
#  This script is distributed in the hope that it will be useful, but
#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
#  for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

# At least the COAUTHORSHIP and PUBCOUNT arguments are required: when they are
# missing, print the usage message on standard output and terminate.
unless ARGV.size >= 2
	usage = <<USAGE
COAUTHORSHIP PUBCOUNT [VOTING]
Reads a standard dblp.xml file from standard input and
writes on standard output lines containing a publication number,
an author name and the publication DBLP key. COAUTHORSHIP will contain
pairs of authors (in lexicographical order) followed by the number of
publications in common. PUBCOUNT will contain a lexicographically sorted
list of the authors together with their number of publications. Fields
are always separated by a TAB. The voting file, when specified,
contains pairs of authors that determine a delegation graph (according
to the election rules specified in the paper "Voting in social networks").
USAGE
	puts("#{__FILE__} #{usage}")
	exit
end

require 'rubygems'
require 'htmlentities'
require 'rexml/document'
require "rexml/streamlistener"
include REXML

# Output file names, in command-line order. The third one is optional:
# voting is nil when it has not been specified.
coauthorship, pubcount, voting = ARGV.first(3)

# Stream listener gathering authorship data from the XML events it receives.
#
# Every author found inside an accepted publication element is printed on
# standard output (publication number, author name and publication key,
# separated by TABs) as soon as the author element is closed. After parsing,
# count maps each author name to the number of author elements seen for it,
# and coauth maps each author name to a hash from coauthor names to the number
# of publications in common.
class Handler
	include StreamListener

	attr_accessor :coauth, :count

	def initialize
		@coder = HTMLEntities.new
		# Author name => number of publications (0 for names never seen).
		@count = Hash.new(0)
		# Names of the XML elements that are treated as publications.
		@accepted = { "book" => true, "article" => true, "incollection" => true, "inproceedings" => true }
		# Name of the publication element we are inside, or nil.
		@inside = nil
		# True while we are inside an author element of a publication.
		@inside_author = false
		# Sequential number of the current publication; the first one gets 0.
		@pubb = -1
		# Author name => hash from coauthor names to number of common publications.
		@coauth = {}
		# Names of the authors of the current publication (used as a set).
		@curr = {}
		# The DBLP key of the current publication.
		@key = nil
	end

	# Start of an element: opens a new publication, or starts accumulating an
	# author name when the author belongs to a publication.
	def tag_start(name, attrs)
		if @accepted[name]
			@pubb += 1
			@curr.clear
			@key = attrs["key"]
			@inside = name
		elsif name == "author" && !@inside.nil?
			@author = ""
			@inside_author = true
		end
	end

	# End of an element: closes the current publication (updating the
	# coauthorship counts), or records and prints an author just completed.
	def tag_end(name)
		if @accepted[name]
			record_coauthorships if name == @inside
			@inside = nil
		elsif name == "author" && !@inside.nil?
			@count[@author] += 1
			@curr[@author] = true
			printf("%d\t%s\t%s\n", @pubb, @author, @key)
			@inside_author = false
		end
	end

	# Character data: appended (after entity decoding) to the current author
	# name; anything outside of an author element is ignored.
	def text(text)
		@author += @coder.decode(text) if @inside_author
	end

	private

	# Enumerates all ordered pairs of distinct authors of the current
	# publication and increments the corresponding coauthorship counts.
	# An author without coauthors still gets an (empty) entry in @coauth.
	def record_coauthorships
		authors = @curr.keys
		authors.each do |a|
			shared = (@coauth[a] ||= Hash.new(0))
			authors.each { |b| shared[b] += 1 unless a == b }
		end
	end
end


# Feed the XML read from standard input to a fresh handler, then pick up the
# tables it has built.
handler = Handler.new
Document.parse_stream($stdin, handler)
coauth = handler.coauth
count = handler.count

# The authors in lexicographical order, and a hash mapping each author to its
# rank (starting from 0) in that order.

sorted_authors = count.keys.sort
author_map = {}
sorted_authors.each_with_index { |name, rank| author_map[name] = rank }

# For each author/coauthor pair (both in lexicographical order) writes a
# TAB-separated line containing the rank of the author, the rank of the
# coauthor and the number of common publications.
#
# The block form of File.open closes the file even if an exception is raised,
# and buffered output replaces syswrite, whose return value (the number of
# bytes actually written) was ignored and which costs a system call per line.

File.open(coauthorship, "w") do |out|
	sorted_authors.each do |a|
		h = coauth[a]
		# Authors that never appeared inside a closed publication have no entry.
		next if h.nil?
		h.keys.sort.each do |coa|
			out.printf("%s\t%s\t%d\n", author_map[a], author_map[coa], h[coa])
		end
	end
end

# For each author (in lexicographical order) writes a TAB-separated line
# containing the author name and the number of publications.
#
# The block form of File.open closes the file even if an exception is raised,
# and buffered output replaces syswrite, whose return value (the number of
# bytes actually written) was ignored and which costs a system call per line.

File.open(pubcount, "w") do |out|
	sorted_authors.each do |a|
		out.printf("%s\t%d\n", a, count[a])
	end
end

# Write the arcs of the voting graph. Each author votes for the coauthor with
# the largest number of common publications among those coauthors that have
# strictly more publications than the author; an author with no such coauthor
# votes for itself. Arcs are written in
# it.unimi.dsi.webgraph.ArcListASCIIGraph format, with nodes numbered
# starting from 0.
#
# The block form of File.open closes the file even if an exception is raised,
# and buffered output replaces syswrite, whose return value (the number of
# bytes actually written) was ignored and which costs a system call per line.

if voting
	File.open(voting, "w") do |out|
		sorted_authors.each do |a|
			h = coauth[a]
			# Coauthors by decreasing number of common publications. The order
			# among coauthors with the same number is whatever the sort yields
			# (Ruby's sort is not guaranteed to be stable).
			ranked = h.nil? ? [] : h.keys.sort { |x, y| h[y] <=> h[x] }
			elected = ranked.find { |coa| count[coa] > count[a] }
			# Without an eligible coauthor the arc is a self-loop.
			out.printf("%d\t%d\n", author_map[a], author_map[elected || a])
		end
	end
end
