/*
 * $Id: RSSWebNavigator.java,v 1.11 2005/12/27 16:55:10 rampil Exp $
 * Copyright (c) 2005 LOGICAL-PARADOX.ORG
 */
package org.logical_paradox.rss.robot;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.rmi.RemoteException;
import java.util.Calendar;
import java.util.Hashtable;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.logical_paradox.common.net.IPAddressRangeSet;
import org.logical_paradox.common.net.IPAddressRangeSetFactory;
import org.logical_paradox.rss.IllegalObjectStateException;
import org.logical_paradox.rss.RSSConstant;
import org.logical_paradox.rss.RSSIllegalConfigException;
import org.logical_paradox.rss.RSSProperties;
import org.logical_paradox.rss.dsr.RSSDistributedServiceRegistry;
import org.logical_paradox.rss.dsync.SyncQueueException;
import org.logical_paradox.rss.http.HREFCollector;
import org.logical_paradox.rss.http.HTMLContentsFilter;
import org.logical_paradox.rss.http.NoIndexException;
import org.logical_paradox.rss.http.Site;
import org.logical_paradox.rss.http.URLPool;
import org.logical_paradox.rss.http.URLSuspender;
import org.logical_paradox.rss.http.WebContents;
import org.logical_paradox.rss.lcmgr.RSSLocalContentsMgr;
import org.logical_paradox.rss.lookup.Lookup;
import org.logical_paradox.rss.lookup.LookupFactory;
import org.logical_paradox.rss.robot.event.SNodeDSyncEventListener;
import org.logical_paradox.rss.router.RNodeClientFactory;
import org.logical_paradox.rss.router.RNodeDSyncMngr;
import org.logical_paradox.rss.router.RoutingNodeClient;
import org.logical_paradox.rss.router.SiteLock;
import org.logical_paradox.rss.router.algorithm.RoutingAlgorithm;
import org.logical_paradox.rss.util.RandomKeyGenerator;

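/**
 * RSSWebNavigator
 * Decides which route (URL) the RSS web robot should crawl next.
 * When necessary it exchanges route information with the master (routing node) server and presents the next route.
 * @author satoshi akabane@logical-paradox.org
 * @version $Revision: 1.11 $
 */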
public class RSSWebNavigator {
	/** Logger. */
	private static final Log log = LogFactory.getLog(RSSWebNavigator.class);

	/** Path of the default configuration file read by {@link #getNavigator()}. */
	public static String NAVIGATIOR_CONF_FILENAME = "../conf/rssnavigator.conf";

	/** Configuration of this navigator (loaded in {@link #getNavigator()}). */
	private RSSNavigatorConfig cfg = null;

	/** Pool of URLs waiting to be crawled. */
	private URLPool urls = null;
	/** Number of URLs accepted so far (increased in sendContents). */
	private int nowFetchCnt = 0;
	/** Maximum number of URLs that may be accepted; 0 is treated as "no limit". */
	private int maxFetchCnt = 0;
	/** Routing algorithm used to pick the next URL from the pool. */
	private RoutingAlgorithm algo = null;
	/** Site objects known to this navigator, keyed by site name (host[:port]). */
	private Hashtable sites = null;

	/** Stub (client factory) of the routing node; only set in distributed mode. */
	private RNodeClientFactory rnode = null;
	/** Routing node client; only set in distributed mode. */
	private RoutingNodeClient rnClient = null;
	/** ID of this search node (received from the routing node, or generated locally in standalone mode). */
	private String robotId = null;
	/** Number of site locks this node may hold at the same time. */
	private int limKeepLocks = 0;
	/** Object managing URLs whose crawling is currently suspended. */
	private URLSuspender suspender = null;
	/** Time (epoch millis) at which the site-lock garbage collection was last triggered. */
	private long siteLockLastGCTime = 0;

	/** Shutdown flag; set by shutdown() and read by isFinished(). */
	private boolean shutdownFlg = false;

	/** Navigation mode (true: distributed system, false: standalone). */
	private boolean navimode = true;
	/** Local contents management system; null until enableLocalContentsMgr(true) is called. */
	private RSSLocalContentsMgr lcm = null;

	/** Daemon that supports message exchange with the routing node; only created in distributed mode. */
	private RNodeDSyncMngr dsyncman;
	/** State of the "echo URL" setting (copied from the configuration; not read in this class). */
	protected boolean flg_EchoURL = true;


	/**
	 * Constructor.
	 * Creates the empty URL pool, site table and URL suspender; everything else
	 * is filled in by {@link #getNavigator()}.
	 */
	protected RSSWebNavigator() {
		this.suspender = new URLSuspender();
		this.sites = new Hashtable();
		this.urls = new URLPool();
	}

	/**
	 * Creates and initialises a route navigator from the file named by
	 * {@link #NAVIGATIOR_CONF_FILENAME}.
	 * <p>
	 * In standalone mode the configured root URL is queued and a local node ID is
	 * generated. In distributed mode the routing node is looked up, this search
	 * node is bound to it and the synchronisation daemon is started.
	 * @return the initialised route navigator
	 * @throws IllegalObjectStateException connecting to the routing node (or any other set-up step) failed
	 * @throws RSSIllegalConfigException the navigator configuration is invalid (e.g. no root URL in standalone mode)
	 * @throws IOException reading the configuration file failed
	 */
	public static RSSWebNavigator getNavigator()
					throws IllegalObjectStateException, RSSIllegalConfigException, IOException {

		RSSWebNavigator n = new RSSWebNavigator();

		// Load the configuration for the route navigator
		n.cfg = new RSSNavigatorConfig(NAVIGATIOR_CONF_FILENAME);
		n.navimode = n.cfg.getNavigationMode();

		log.info("RSSTVXe" + (n.navimode==true?"UVXe":"X^hA[VXe") + "ƂĐݒ肵܂");

		n.maxFetchCnt = n.cfg.getDebugTraceURLs();
		n.flg_EchoURL = n.cfg.getEchoURL();
		n.limKeepLocks = n.cfg.getKeepLocks();
		n.siteLockLastGCTime = Calendar.getInstance().getTimeInMillis();

		n.algo = n.cfg.getRoutingAlgorithm();

		log.info("oHASY'" + n.algo.getAlgorithmName() + "'Ɏw肳܂");

		try {
			if(n.navimode == false) {
				// Standalone: register the URL the crawl starts from
				String root = n.cfg.getRoot();
				if(root == null || root.trim().length() == 0) {
					// The 'ROOT' entry does not exist
					throw new RSSIllegalConfigException();
				}

				n.urls.add(root);

				// A standalone node has no search node ID yet, so create one first
				n.robotId = "SNODE:" + RandomKeyGenerator.getUniqKey();
			} else {
				// Distributed: the crawl start position is handed out by the routing node.
				// First query the service registry, then obtain the client stub
				// used to connect to the routing node.
				log.info("oHm[hɃT[rXCĂ܂");

				Lookup lookup = LookupFactory.getLookup(n.cfg.getDistServRegURL());
				n.rnode = (RNodeClientFactory)lookup.lookup(RSSDistributedServiceRegistry.RSS_SERVID_ROUTING_NODE_CLIENT_FACTORY);
				n.rnClient = n.rnode.bindSearchNode();

				n.robotId = n.rnClient.getNodeId();

				log.info("Tm[hoHm[h[" + n.rnode.getNodeId() +"]ɃoCh܂");
				log.info("Tm[hID: " + n.robotId);

				// Build the IP address range set (left null when the HTTP stub/test mode is on)
				IPAddressRangeSet rangeSet = null;

				String httpStubModeString = RSSProperties.getString(RSSConstant.RSS_PKEY_STUB_HTTP_CONNECTION);
				if(httpStubModeString != null && Boolean.valueOf(httpStubModeString).booleanValue() == true) {
					log.info("eXgڑ[ĥ߁CIP͈͂𐧌܂");
				} else {
					rangeSet = IPAddressRangeSetFactory.getIPAddressRangeSet();
					rangeSet.setRanges(n.rnClient.getAllowedRoutingIPRanges());
				}

				// Start the synchronisation daemon
				n.dsyncman = new RNodeDSyncMngr(n.cfg, n.rnClient, 1000, n.urls, rangeSet);
				n.dsyncman.addListener(new SNodeDSyncEventListener(n));
				n.dsyncman.start();
			}
		} catch(RSSIllegalConfigException ce) {
			// A configuration error is part of this method's declared contract;
			// do not let the generic handler below turn it into an IllegalObjectStateException.
			throw ce;
		} catch(MalformedURLException me) {
			log.info("oHm[hIT[o[URL", me);
			throw new IllegalObjectStateException(me.getMessage());
		} catch(RemoteException re) {
			log.info("oHm[hƂ̒ʐMɎs܂", re);
			throw new IllegalObjectStateException(re.getMessage());
		} catch(Exception e) {
			// Route the stack trace through the logger instead of printStackTrace()
			log.info("̑̌ɂG[ : " + e.getMessage(), e);
			throw new IllegalObjectStateException(e.getMessage());
		}

		return n;
	}


	/**
	 * Switches the local contents management system on or off.
	 * If it is never enabled, the local contents management system is not used.<br>
	 * @param mode true: start the local contents management system / false: stop it
	 * @throws RSSIllegalConfigException initialising the local contents management system failed
	 */
	public void enableLocalContentsMgr(boolean mode) throws RSSIllegalConfigException {
		if(!mode) {
			// Stop request: only meaningful when a manager exists
			if(lcm != null) {
				lcm.done();
			}
			return;
		}

		// Start request: create the manager on first use ...
		if(lcm == null) {
			lcm = new RSSLocalContentsMgr(navimode);
		}
		// ... and switch it on
		lcm.start();
	}

	/**
	 * Checks whether this node holds the crawl lock for the site of the given URL.
	 * If the lock is not held yet, an attempt is made to acquire it.<br>
	 * Returns false when the lock cannot be obtained, and also when this node already
	 * holds its maximum number of locks (the URL is then pooled for forwarding to the
	 * routing node instead).<br>
	 * Returns true when the lock is already held or could be acquired.
	 * When no synchronisation daemon exists (it is only created in distributed mode)
	 * there is nobody to arbitrate locks, so the URL is treated as lockable.
	 * @param url URL whose site is to be locked
	 * @return true: lock is held / false: lock is not available (or the URL is malformed)
	 */
	public boolean isLocked(String url) {
		String siteName = getSiteNameByURL(url);
		if(siteName == null) {
			return false;
		}

		if(dsyncman == null) {
			// Standalone operation: without the sync daemon there is no lock protocol.
			// Previously this path ended in a NullPointerException.
			return true;
		}

		String rid = getRobotId();

		// Already holding enough locks and none of them is for this site:
		// hand the URL over to the routing node instead of crawling it here.
		if(dsyncman.findLock(siteName, rid) == null && dsyncman.numOfMyLocks(rid) >= limKeepLocks) {
			try {
				dsyncman.poolURL(url);
			} catch(SyncQueueException se) {
				// The URL could not be queued for forwarding and is therefore lost; make that visible.
				log.warn("failed to pool URL for forwarding: " + url, se);
			}
			return false;
		}

		// Check the lock for the site and request it if necessary;
		// give up when it is neither held nor obtainable.
		if(requireSiteLock(siteName, rid) == false) {
			log.info(url + "bNł܂ł");
			return false;
		}

		return true;
	}

	/**
	 * Checks whether crawling the given URL is permitted.
	 * Only the site's robots.txt rules are consulted here;<br>
	 * restrictions given through META tags are judged by the contents analysis.
	 * @param url target URL
	 * @return true: crawling is OK (no rules, or allowed by them) / false: crawling is forbidden
	 */
	public boolean isAllowed(String url) {
		final String host = getSiteNameByURL(url);
		if(host == null) {
			return false;
		}

		boolean permitted;

		// Look up (or build) the site object under the table's lock
		synchronized(sites) {
			Site site = (Site)sites.get(host);
			if(site == null) {
				try {
					log.trace("TCgIuWFNg쐬Ă܂");
					site = Site.getInstance(url, cfg.getProperty("HTTP_USER_AGENT"));
					sites.put(host, site);

				} catch(IllegalArgumentException e) {
					// No site object can be built for this URL, so treat it as not allowed
					return false;
				}
			}

			// Allowed when the site has no rules at all, or the rules permit this URL
			permitted = !site.hasRule() || site.isAllowed(url, cfg.getHttpUserAgent());
		}

		log.trace(url + "́C" + (permitted ? "" : "֎~") + "Ă܂");
		return permitted;
	}

	/**
	 * Confirms, or requests, the lock for the given site.
	 * Returns true when the lock is already held or could be acquired by the request;<br>
	 * returns false when the lock request is refused or fails.
	 * @param dn domain (site) name
	 * @param rid search node ID
	 * @return true: the lock is held / false: acquiring it failed
	 */
	protected boolean requireSiteLock(String dn, String rid) {
		// A lock already known to the sync daemon needs no further request
		if(dsyncman.findLock(dn, rid) != null) {
			return true;
		}

		// Otherwise ask for it; a queue failure counts as "not acquired"
		SiteLock request = new SiteLock(dn, rid);
		try {
			return dsyncman.lock(request);
		} catch(SyncQueueException se) {
			return false;
		}
	}

	/**
	 * Replaces the algorithm used to choose the next URL.
	 * @param r routing algorithm to use from now on
	 */
	public void setRoutingAlgorithm(RoutingAlgorithm r) {
		this.algo = r;
	}

	/**
	 * Returns the ID of this search node.
	 * @return search node ID
	 */
	public String getRobotId() {
		return this.robotId;
	}

	/**
	 * Closes the navigator.
	 * Stops the local contents management system (if one was enabled) and, in
	 * distributed mode, detaches this search node from the routing node.
	 * @throws RemoteException detaching failed, or the navigator was already closed
	 */
	public void close() throws RemoteException {
		// Stop the local contents management system first
		if(lcm != null) {
			log.info("[JRecǗVXe~Ă܂");
			try {
				enableLocalContentsMgr(false);
				log.info("[JRecǗVXe~܂");
			} catch(Exception e) {
				// Best effort: keep closing, but log the full exception, not only its message
				log.error("~ɗO܂:\n" + e.getMessage(), e);
			}
		}

		if(navimode == RSSNavigatorConfig.NAVIMODE_STANDALONE) {
			// Running standalone: there is no routing node to detach from
			return;
		}

		log.info("oHm[hTm[h'" + getRobotId() + "'؂藣Ă܂");

		if(rnode == null) {
			throw new RemoteException("navigator has been already closed");
		}

		// Detach from the routing node
		rnode.unbindSearchNode(getRobotId());

		log.info("'" + getRobotId() + "'́CoHm[h؂藣܂");

		// Drop the stubs so that a second close() is detected above
		rnode = null;
		rnClient = null;
	}

	/**
	 * Registers the URLs found in a crawled page and hands the page to the
	 * local contents management system.
	 * <p>
	 * A fetched robots.txt is passed to {@link #releaseSearchLock(WebContents)} and
	 * not processed further. For other pages the anchors are collected and either
	 * queued locally or, in distributed mode once the local pool holds
	 * {@code cfg.getKeepUrls()} URLs, pooled for forwarding to the routing node.
	 * Pages that forbid indexing are never stored; their links are only used when
	 * the page allows following them.
	 * @param c web contents (may be null, in which case nothing is registered)
	 * @throws IllegalObjectStateException raised by the local contents management system
	 * @throws MalformedURLException a URL has an invalid format
	 * @throws IOException raised by the local contents management system
	 */
	public void sendContents(WebContents c)
									throws IllegalObjectStateException, MalformedURLException, IOException {

		// Once the configured interval has passed, trigger the site-lock garbage collection.
		// The sync daemon only exists in distributed mode, so guard against null.
		if(dsyncman != null
				&& Calendar.getInstance().getTimeInMillis() - siteLockLastGCTime >= cfg.getSiteLockExpire()) {
			log.trace("TbÑKx[WRN^N܂");
			dsyncman.gc();
			siteLockLastGCTime = Calendar.getInstance().getTimeInMillis();
		}

		if(c == null) {
			// No web contents: nothing to do
			return;
		}

		if(c.getURL().toString().indexOf("robots.txt") >= 0) {
			// The fetched page is a robots.txt: apply it and lift the suspension
			releaseSearchLock(c);
			// No further processing is needed
			return;
		}

		if(c.getWebContents() == null) {
			// No contents body: return as is
			return;
		}

		boolean needToStore = true;

		// Collect the anchor tags
		Vector anchors = null;
		try {
			anchors = HREFCollector.collect(c.getWebContents());
		} catch(NoIndexException noe) {
			// The page forbids indexing: never store it, whether or not its links may be followed
			needToStore = false;
			if(noe.isFollowable() == true) {
				anchors = noe.getAnchorsInThisPage();		// may be null
			}
		}
		Vector an = HREFCollector.getAnchorsAsABSPath(c.getURL(), anchors);
		if(an == null) {
			return;
		}

		// Accept all anchors unless a fetch limit is set and would be exceeded
		int receives = maxFetchCnt == 0 || (an.size() + nowFetchCnt) < maxFetchCnt ? an.size() : maxFetchCnt - nowFetchCnt;

		int nowURLs = urls.size();

		int i = 0;
		try {
			for(; i < receives; i++) {
				if(dsyncman != null && nowURLs >= cfg.getKeepUrls() && navimode == RSSNavigatorConfig.NAVIMODE_DISTRIB) {
					// The local pool is full: pool the URL so the sync daemon forwards it
					// to the routing node (distributed mode only)
					dsyncman.poolURL((String)an.elementAt(i));
				} else {
					// No forwarding: this search node handles the URL itself
					urls.add((String)an.elementAt(i));
					nowURLs++;
				}
			}
		} catch(SyncQueueException se) {
			// The remaining URLs of this page are dropped; report it instead of failing silently
			log.warn("failed to pool URLs for forwarding; dropping " + (receives - i)
					+ " URL(s) found in " + c.getURL(), se);
		}

		nowFetchCnt += receives;

		// Add the filter that converts HTML into plain text
		c.addContentsFilter(new HTMLContentsFilter());

		// Send the contents to the contents management system,
		// but only when indexing of the page is permitted
		if(lcm != null && needToStore == true) {
			synchronized(lcm) {
				while(lcm.isReceivable() == false) {
					try {
						lcm.wait();
					} catch(InterruptedException e) {
						// keep waiting until the manager can receive again
					}
				}
			}

			lcm.store(c);
		}
	}


	/**
	 * Lifts the crawl suspension of a site after its robots.txt was requested.
	 * When the robots.txt could be fetched, its rules are set on the site object;
	 * the suspended URLs of the site are pushed back to the URL pool either way.
	 * @param c web contents of the robots.txt request
	 * @throws IllegalObjectStateException declared for the site/suspender operations used here
	 */
	public void releaseSearchLock(WebContents c) throws IllegalObjectStateException {
		// Update the Site object from the received data (set the rules if a page was fetched)
		String dn = getSiteNameByURL(c.getURL().toString());
		if(dn == null) {
			return;
		}

		Site s = (Site)sites.get(dn);
		if(s == null) {
			// No site object was created beforehand, so there is nothing to update;
			// just warn and carry on.
			log.warn("TCgIuWFNgȂ̂Ŗ܂");
			return;
		}
		String webContents = c.getWebContents();
		if(webContents == null || webContents.trim().length() == 0) {
			// No contents (e.g. 404): nothing to parse
			log.trace("URL: " + c.getURL().toString() + "݂͑܂ł");
		} else {
			// robots.txt with some data in it: parse it into the site's rules
			ByteArrayInputStream in = new ByteArrayInputStream(webContents.getBytes());
			try {
				s.getRuleFromTheSite(in);
				log.trace("TCg'" + dn + "'̃[ݒ肵܂");
			} catch(IOException ie) {
				// Log the exception itself so the parse failure can be diagnosed
				log.error("robots.txt̉͂Ɏs", ie);
			}
		}

		// Whether or not a robots.txt existed, lift the suspension here
		try {
			sites.put(dn, s);
			String[] releasedURLs = suspender.release(dn);
			if(releasedURLs != null) {
				// URLs were released: put all of them back into the queue
				pushbackURLs(releasedURLs);
			}
			log.trace("TCg'" + dn + "'̒T}Ԃ܂");
		} catch(IllegalArgumentException ie) {
			log.trace("TCg'" + dn + "'̉Ɏs܂", ie);
		}
	}

	/**
	 * Returns the next URL to crawl.
	 * URLs are taken from the pool through the routing algorithm until one is found
	 * that may be crawled now. When the site of a URL has no Site object yet, the
	 * URL is suspended and the site's robots.txt URL is returned instead.
	 * @return the next URL to crawl, or null when the algorithm yields no more URLs
	 */
	public String getNextURL() {
		String rc = null;

		while((rc = algo.nextURL(urls)) != null) {
			// Check whether the URL just taken out is currently suspended
			try {
				if(suspender.isSuspended(rc) == true) {
					// The URL belongs to a suspended site, so register it with the suspender
					// (it is gone from the queue but is re-queued when the suspension is lifted)
					// NOTE(review): there is no 'continue' after suspend(), so the URL also falls
					// through to the lock/robots handling below — confirm this is intended.
					// NOTE(review): Site.getInstance is given the site name here, while the other
					// call sites pass a full URL — verify against Site.getInstance.
					String dn = getSiteNameByURL(rc);
					if(dn != null) {
						suspender.suspend(Site.getInstance(dn, cfg.getHttpUserAgent()), rc);
					}
				}
			} catch(IllegalArgumentException e) {
				// Treated as an invalid URL: drop it and try the next one
				continue;
			}

			// Determine the site name used to look up the Site object
			String dn = getSiteNameByURL(rc);
			if(dn == null) {
				// Malformed URL: drop it
				continue;
			}

			// NOTE(review): never read in this method
			boolean robotsfg = false;

			// Acquire the lock
			if(isLocked(rc) == true) {

				// The lock was acquired, or was already held
				if(sites.get(dn) == null) {

					// No site object yet: return the robots.txt URL instead of this URL
					// NOTE(review): an IllegalArgumentException from Site.getInstance is not caught here
					Site s = Site.getInstance(rc, cfg.getHttpUserAgent());

					// Register the URL under its site with the suspender
					suspender.suspend(s, rc);

					// Remove the URL that triggered the robots.txt lookup from the history;
					// needed so that this URL can be crawled again later
					algo.removeFromHistory(rc);

					// Build the robots.txt URL and return that
					rc = s.getRobotsTxtURL();

					// Finally register the site object
					sites.put(dn, s);
					break;
				} else if(isAllowed(rc) == true) {
					// A site object exists and crawling is permitted:
					// this URL is the result, leave the loop
					break;
				}
			} else {
				// The lock could not be acquired, so this URL is skipped
				// (effectively dropped here).
				// If needed, forwarding to another search node could be added at this point.
			}
		}

		return rc;
	}

	/**
	 * Pushes URLs back into the URL pool.
	 * Entries the pool rejects as malformed are skipped.
	 * @param u URLs to push back (null is treated as "nothing to do")
	 */
	private void pushbackURLs(String[] u) {
		final int total = (u == null) ? 0 : u.length;

		int idx = 0;
		while(idx < total) {
			try {
				urls.add(u[idx]);
			} catch(MalformedURLException ue) {
				// The URL being added is faulty: ignore it
			}
			idx++;
		}
	}

	/**
	 * Returns how many URLs remain in the pool.
	 * @return number of remaining URLs
	 */
	public int getQueueSize() {
		final int remaining = urls.size();
		return remaining;
	}

	/**
	 * Delegates to the URL pool's {@code waitUntilEmpty()}, which (per the original
	 * comment) blocks the calling thread while the pool holds no URLs.
	 */
	public void waitUntilEmpty() {
		this.urls.waitUntilEmpty();
	}

	/**
	 * Tells whether this route navigator has finished its work.
	 * That is the case after {@link #shutdown()}, or when nothing is suspended or
	 * queued any more and a fetch limit is set and has been reached.
	 * @return true: the navigator can stop / false: it cannot
	 */
	public boolean isFinished() {
		if(shutdownFlg) {
			return true;
		}

		// Suspended URLs and the queue are both empty ...
		final boolean nothingPending = (countSuspendedSites() + urls.size()) == 0;
		// ... and a limit is configured and has been reached
		return nothingPending && maxFetchCnt > 0 && nowFetchCnt >= maxFetchCnt;
	}

	/**
	 * Reports the size of the suspender, i.e. how much is currently suspended.
	 * @return value of {@code suspender.size()}
	 */
	public int countSuspendedSites() {
		final int suspended = suspender.size();
		return suspended;
	}

	/**
	 * Extracts the site name from a full URL.
	 * The site name is the host, followed by ":port" when the URL carries an explicit port.
	 * @param url full URL
	 * @return site name, or null when the URL is malformed
	 */
	private String getSiteNameByURL(String url) {
		final URL parsed;
		try {
			parsed = new URL(url);
		} catch(MalformedURLException e) {
			return null;
		}

		// getPort() is not positive when the URL has no explicit port
		String portSuffix = "";
		if(parsed.getPort() > 0) {
			portSuffix = ":" + parsed.getPort();
		}
		return parsed.getHost() + portSuffix;
	}

	/**
	 * Returns the navigation mode.
	 * @return true: distributed system / false: standalone
	 */
	public boolean getNavigationMode() {
		return this.navimode;
	}
	/**
	 * Forces the route navigator to shut down:
	 * from now on {@link #isFinished()} reports true.
	 */
	public void shutdown() {
		this.shutdownFlg = true;
	}
}

// end of RSSWebNavigator.java
