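/**
 * RSSWebNavigator
 * Decides the route (which URLs to explore next) for the RSS Web crawling robot.
 * When necessary it talks to the route-master server and presents the next route.
 */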
package org.logical_paradox.rss.robot;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.rmi.Naming;
import java.rmi.NotBoundException;
import java.rmi.RemoteException;
import java.util.Calendar;
import java.util.Hashtable;
import java.util.Vector;

import org.logical_paradox.common.net.IPAddressRangeSet;
import org.logical_paradox.common.net.IPAddressRangeSetFactory;
import org.logical_paradox.rss.IllegalObjectStateException;
import org.logical_paradox.rss.RSSIllegalConfigException;
import org.logical_paradox.rss.dsync.SyncQueueException;
import org.logical_paradox.rss.http.HREFCollector;
import org.logical_paradox.rss.http.HTMLContentsFilter;
import org.logical_paradox.rss.http.NoIndexException;
import org.logical_paradox.rss.http.Site;
import org.logical_paradox.rss.http.URLPool;
import org.logical_paradox.rss.http.URLSuspender;
import org.logical_paradox.rss.http.WebContents;
import org.logical_paradox.rss.lcmgr.RSSLocalContentsMgr;
import org.logical_paradox.rss.logging.RSSDebugLogWriter;
import org.logical_paradox.rss.robot.event.SNodeDSyncEventListener;
import org.logical_paradox.rss.router.RNodeDSyncMngr;
import org.logical_paradox.rss.router.RSSRoutingNodeSelector;
import org.logical_paradox.rss.router.RoutingNodeClient;
import org.logical_paradox.rss.router.SiteLock;
import org.logical_paradox.rss.router.algorithm.RoutingAlgorithm;
import org.logical_paradox.rss.util.RandomKeyGenerator;


public class RSSWebNavigator {
	// Path of the configuration file that getNavigator() loads.
	public static String NAVIGATIOR_CONF_FILENAME = "conf/rssnavigator.conf";

	// Navigator configuration; assigned by getNavigator().
	private RSSNavigatorConfig cfg = null;

	// Pool of URLs that getNextURL() draws from and sendContents() feeds.
	private URLPool urls = null;
	// Running total of URLs accepted by sendContents().
	private int nowFetchCnt = 0;
	// Cap for nowFetchCnt, read from cfg.getDebugTraceURLs(); sendContents() treats 0 as "no cap".
	private int maxFetchCnt = 0;
	// Strategy used by getNextURL() to pick the next URL out of the pool.
	private RoutingAlgorithm algo = null;
	// Site objects keyed by the site name produced by getSiteNameByURL().
	private Hashtable sites = null;

	private RSSRoutingNodeSelector selector = null;			// routing node selector
	private RoutingNodeClient rnClient = null;				// client of the routing node
	private String robotId = null;							// ID of this search node (received from the routing node, or generated locally in standalone mode)
	private int limKeepLocks = 0;							// number of locks that may be held at the same time
	private URLSuspender suspender = null;					// object managing the URLs whose crawling is currently suspended
	private long siteLockLastGCTime = 0;					// time (ms) the site-lock manager was last garbage-collected

	private boolean navimode = true;						// navigation mode (true: distributed/remote, false: standalone)
	private RSSLocalContentsMgr lcm = null;					// local contents management system

	// Lock/URL synchronisation manager; getNavigator() creates it only when navimode is true,
	// otherwise it stays null.
	private RNodeDSyncMngr dsyncman;
	// Read from cfg.getEchoURL(); currently only guards commented-out debug output in getNextURL().
	protected boolean flg_EchoURL = true;


	/**
	 * Creates a navigator with an empty URL pool, an empty site table and a
	 * fresh URL suspender. Everything that depends on the configuration is
	 * filled in afterwards by {@link #getNavigator()}.
	 */
	protected RSSWebNavigator() {
		this.urls = new URLPool();
		this.sites = new Hashtable();
		this.suspender = new URLSuspender();
	}

	/**
	 * Builds a navigator from the configuration file named by
	 * {@link #NAVIGATIOR_CONF_FILENAME}.
	 * <p>
	 * In standalone mode (navigation mode {@code false}) the configured root
	 * URL is put into the URL pool and a local robot ID is generated.
	 * In distributed mode the routing node selector is looked up through RMI,
	 * this search node is bound to a routing node, and the data
	 * synchronisation manager is created and started.
	 * <p>
	 * Every failure raised while setting up either mode - including a missing
	 * root URL - is reported as {@link IllegalObjectStateException}; the
	 * original exception is attached as its cause so the stack trace is not lost.
	 *
	 * @return the configured navigator
	 * @throws IllegalObjectStateException if the mode-specific setup fails
	 * @throws RSSIllegalConfigException declared for the configuration calls made here
	 * @throws IOException declared for the configuration calls made here
	 */
	public static RSSWebNavigator getNavigator()
					throws IllegalArgumentException, IllegalObjectStateException, RSSIllegalConfigException, IOException {

		RSSWebNavigator n = new RSSWebNavigator();

		// Load the configuration for the route navigator
		n.cfg = new RSSNavigatorConfig( NAVIGATIOR_CONF_FILENAME );
		n.navimode = n.cfg.getNavigationMode();

		RSSDebugLogWriter.write( "RSSTVXe" + (n.navimode==true?"UVXe":"X^hA[VXe") + "ƂĐݒ肵܂" );

		n.maxFetchCnt = n.cfg.getDebugTraceURLs();
		n.flg_EchoURL = n.cfg.getEchoURL();
		n.limKeepLocks = n.cfg.getKeepLocks();
		n.siteLockLastGCTime = Calendar.getInstance().getTimeInMillis();

		n.algo = n.cfg.getRoutingAlgorithm();

		RSSDebugLogWriter.write( "oHASY'" + n.algo.getAlgorithmName() + "'Ɏw肳܂" );

		try {
			if( n.navimode == false ) {
				// Standalone: register the URL the crawl starts from
				String root = n.cfg.getRoot();
				if( root == null || root.trim().length() == 0 ) {
					// No 'ROOT' entry in the configuration. Give the exception a
					// message so the wrapper thrown below does not report "null".
					throw new IllegalArgumentException(
							"no start URL (ROOT) is configured in " + NAVIGATIOR_CONF_FILENAME );
				}

				n.urls.add( root );

				// A standalone navigator is never handed a search node ID, so create one here
				n.robotId = "SNODE:" + RandomKeyGenerator.getUniqKey();
			} else {
				// Distributed: the start position is handed out by a routing node.
				// Ask the selector service for a routing node and obtain the client stub.
				RSSDebugLogWriter.write( "oHm[hɃT[rXCĂ܂" );
				String nodeSelectorURL = n.cfg.getRoutingNodeSelectorURL().trim();
				RSSDebugLogWriter.write( "oHm[hIq: " + nodeSelectorURL );

				n.selector = (RSSRoutingNodeSelector)Naming.lookup( nodeSelectorURL );
				n.rnClient = n.selector.bindSearchNode();
				n.robotId = n.rnClient.getNodeId();

				RSSDebugLogWriter.write( "Tm[hoHm[hɃoCh܂" );
				RSSDebugLogWriter.write( "Tm[hID: " + n.robotId );

				// Build the IP address range set from the ranges the routing node allows
				IPAddressRangeSet rangeSet = IPAddressRangeSetFactory.getIPAddressRangeSet();
				rangeSet.setRanges( n.rnClient.getAllowedRoutingIPRanges() );

				// Start data synchronisation
				n.dsyncman = new RNodeDSyncMngr( n.cfg, n.rnClient, 1000, n.urls, rangeSet );
				n.dsyncman.addListener( new SNodeDSyncEventListener() );
				n.dsyncman.start();
			}
		} catch( NotBoundException nbe ) {
			RSSDebugLogWriter.write( "oHm[hIT[o[܂ł" );
			throw chainCause( new IllegalObjectStateException( nbe.getMessage() ), nbe );
		} catch( MalformedURLException me ) {
			RSSDebugLogWriter.write( "oHm[hIT[o[URL" );
			throw chainCause( new IllegalObjectStateException( me.getMessage() ), me );
		} catch( RemoteException re ) {
			RSSDebugLogWriter.write( "oHm[hƂ̒ʐMɎs܂" );
			throw chainCause( new IllegalObjectStateException( re.getMessage() ), re );
		} catch( Exception e ) {
			RSSDebugLogWriter.write( "̑̌ɂG[ : " + e.getMessage() );
			e.printStackTrace();
			throw chainCause( new IllegalObjectStateException( e.getMessage() ), e );
		}

		return n;
	}

	/**
	 * Attaches {@code cause} to {@code wrapper} and returns the wrapper, so the
	 * original stack trace survives the translation to IllegalObjectStateException.
	 */
	private static IllegalObjectStateException chainCause( IllegalObjectStateException wrapper, Throwable cause ) {
		try {
			wrapper.initCause( cause );
		} catch( IllegalStateException alreadySet ) {
			// The wrapper was constructed with a cause of its own; keep that one.
		}
		return wrapper;
	}


	/**
	 * Switches the local contents management system on or off.
	 * While it has never been enabled, no local contents manager exists and
	 * {@link #sendContents(WebContents)} stores nothing.
	 *
	 * @param mode {@code true} to create the manager if needed and start it;
	 *             {@code false} to stop an existing manager (no-op when none exists)
	 * @throws RSSIllegalConfigException declared for the manager calls made here
	 */
	public void enableLocalContentsMgr( boolean mode ) throws RSSIllegalConfigException {
		if( !mode ) {
			// Switching off: only meaningful when a manager was created earlier
			if( lcm != null ) {
				lcm.done();
			}
			return;
		}

		// Switching on: create the manager lazily on first use
		if( lcm == null ) {
			lcm = new RSSLocalContentsMgr( navimode );
		}
		// Per the original note, starting the manager is what launches its thread
		lcm.start();
	}

	/**
	 * Checks whether this search node holds the crawl lock for the site of the
	 * given URL, trying to acquire the lock when it does not hold it yet.
	 * <p>
	 * When the lock is not held and this node already holds
	 * {@code limKeepLocks} or more locks, the URL is pooled for transfer to the
	 * routing node instead and {@code false} is returned.
	 * <p>
	 * A standalone navigator has no lock manager ({@code dsyncman} is only
	 * created in distributed mode), so nothing can compete for the site and
	 * every well-formed URL is reported as locked.
	 *
	 * @param url the URL about to be crawled
	 * @return {@code true} if the lock is held or could be acquired;
	 *         {@code false} if the URL is malformed or the lock is unavailable
	 */
	public boolean isLocked( String url ) {
		String siteName = getSiteNameByURL( url );
		if( siteName == null ) {
			return false;
		}

		if( dsyncman == null ) {
			// Standalone mode: there is no lock manager to consult
			return true;
		}

		// Already holding as many locks as allowed: hand the URL over to the routing node
		if( dsyncman.findLock( siteName, getRobotId() ) == null && dsyncman.numOfMyLocks( getRobotId() ) >= limKeepLocks ) {
			try {
				dsyncman.poolURL( url );
			} catch( SyncQueueException se ) {
				// The URL is dropped either way; at least leave a trace of it
				RSSDebugLogWriter.write( "failed to pool URL for transfer: " + url + " (" + se.getMessage() + ")" );
			}
			return false;
		}

		// Check the lock for the site and, if necessary, try to acquire it
		if( requireSiteLock( siteName, getRobotId() ) == false ) {
			RSSDebugLogWriter.write( url + "bNł܂ł" );
			return false;
		}

		return true;
	}

	/**
	 * Tells whether the given URL may be crawled.
	 * Only the site's robots.txt rules are consulted here; per the original
	 * note, restrictions given through META tags are judged later while the
	 * contents are analysed.
	 *
	 * @param url the URL to check
	 * @return {@code true} if the site has no rule or the rule allows the URL;
	 *         {@code false} if the URL is malformed, no site object can be
	 *         created for it, or the rule forbids it
	 */
	public boolean isAllowed( String url ) {
		String siteName = getSiteNameByURL( url );
		if( siteName == null ) {
			return false;
		}

		boolean permitted;
		synchronized( sites ) {
			// Look the site object up, creating and registering it on first contact
			Site site = (Site)sites.get( siteName );
			if( site == null ) {
				try {
					RSSDebugLogWriter.write( "TCgIuWFNg쐬Ă܂" );
					site = Site.getInstance( url, cfg.getProperty( "HTTP_USER_AGENT" ) );
					sites.put( siteName, site );
				} catch( IllegalArgumentException e ) {
					// No site object could be built; treat the URL as not allowed
					return false;
				}
			}

			// Allowed when there is no robots.txt rule, or the rule permits this URL
			permitted = !site.hasRule() || site.isAllowed( url, cfg.getHttpUserAgent() );
		}

		String verdict = permitted ? "" : "֎~";
		RSSDebugLogWriter.write( url + "́C" + verdict + "Ă܂" );
		return permitted;
	}

	/**
	 * Confirms, or applies for, the lock on the given site.
	 * <p>
	 * A standalone navigator has no lock manager ({@code dsyncman} is only
	 * created in distributed mode); in that case the lock is considered
	 * granted, because no other search node can compete for the site.
	 *
	 * @param dn  site name (as produced by getSiteNameByURL)
	 * @param rid ID of the search node asking for the lock
	 * @return {@code true} if the lock is already held or was acquired;
	 *         {@code false} if this search node cannot process the site for now
	 */
	protected boolean requireSiteLock( String dn, String rid ) {
		if( dsyncman == null ) {
			// Standalone mode: nothing to synchronise with
			return true;
		}

		// First consult the locks that are already known
		if( dsyncman.findLock( dn, rid ) != null ) {
			return true;
		}

		// Not held yet: apply for the lock
		SiteLock l = new SiteLock( dn, rid );
		try {
			return dsyncman.lock( l );
		} catch( SyncQueueException se ) {
			RSSDebugLogWriter.write( "lock request for site '" + dn + "' failed: " + se.getMessage() );
			return false;
		}
	}

	/**
	 * Replaces the algorithm used to pick the next URL to explore.
	 *
	 * @param r the routing algorithm to use from now on
	 */
	public void setRoutingAlgorithm( RoutingAlgorithm r ) {
		this.algo = r;
	}

	/**
	 * Returns the ID of this search node.
	 *
	 * @return the search node ID assigned in getNavigator()
	 */
	public String getRobotId() {
		return this.robotId;
	}

	/**
	 * Closes the navigator.
	 * The local contents manager, if one exists, is stopped first. In
	 * distributed mode this search node is then unbound from the routing node;
	 * in standalone mode nothing else happens.
	 *
	 * @throws RemoteException if the navigator was already closed (the selector
	 *                         reference is gone) or the unbind call fails
	 */
	public void close() throws RemoteException {
		// Stop the local contents manager; failures are logged, not propagated
		if( lcm != null ) {
			RSSDebugLogWriter.write( "[JRecǗVXe~Ă܂" );
			try {
				enableLocalContentsMgr( false );
				RSSDebugLogWriter.write( "[JRecǗVXe~܂" );
			} catch( Exception e ) {
				RSSDebugLogWriter.write( "~ɗO܂:\n" + e.getMessage() );
			}
		}

		// Only a distributed navigator has a routing node to detach from
		if( navimode != RSSNavigatorConfig.NAVIMODE_STANDALONE ) {
			RSSDebugLogWriter.write( "oHm[hTm[h'" + getRobotId() + "'؂藣Ă܂" );

			if( selector == null ) {
				throw new RemoteException( "navigator has been already closed" );
			}

			// Detach this search node
			selector.unbindSearchNode( getRobotId() );

			RSSDebugLogWriter.write( "'" + getRobotId() + "'́CoHm[h؂藣܂" );

			// Dropping the references marks the navigator as closed
			selector = null;
			rnClient = null;
		}
	}

	/**
	 * Registers the URLs found in a fetched page and, when enabled, hands the
	 * page to the local contents manager.
	 * <p>
	 * Steps, in order:
	 * <ol>
	 * <li>garbage-collect site locks when the configured expiry has elapsed
	 *     (distributed mode only - a standalone navigator has no lock manager);</li>
	 * <li>a page whose URL contains "robots.txt" is passed to
	 *     {@link #releaseSearchLock(WebContents)} and nothing else is done;</li>
	 * <li>anchors are collected and converted to absolute URLs; up to the
	 *     remaining fetch budget they are added to the local pool, or pooled
	 *     for transfer to the routing node when the local pool is full;</li>
	 * <li>unless the page forbids indexing, it is stored through the local
	 *     contents manager once that manager is ready to receive.</li>
	 * </ol>
	 *
	 * @param c the fetched contents; {@code null} is ignored (after the lock GC)
	 * @throws IllegalObjectStateException propagated from releaseSearchLock
	 * @throws MalformedURLException if a collected URL is rejected by the URL pool
	 * @throws IOException declared for the calls made here
	 */
	public void sendContents( WebContents c )
									throws IllegalObjectStateException, MalformedURLException, IOException {

		boolean needToStore = true;

		// Run the site-lock GC when enough time has passed. dsyncman only exists
		// in distributed mode, so it must be checked before it is used.
		if( dsyncman != null && System.currentTimeMillis() - siteLockLastGCTime >= cfg.getSiteLockExpire() ) {
			RSSDebugLogWriter.write( "TbÑKx[WRN^N܂" );
			dsyncman.gc();
			siteLockLastGCTime = System.currentTimeMillis();
		}

		if( c == null ) {
			// No contents at all: nothing to do
			return;
		}

		if( c.getURL().toString().indexOf( "robots.txt" ) >= 0 ) {
			// The fetched page is a robots.txt: apply it and lift the suspension.
			// No further processing is needed for it.
			releaseSearchLock( c );
			return;
		}

		if( c.getWebContents() == null ) {
			// Empty page
			return;
		}

		// Collect the anchor tags
		Vector anchors = null;
		try {
			anchors = HREFCollector.collect( c.getWebContents() );
		} catch( NoIndexException noe ) {
			// The page forbids indexing, so it must never be stored - whether or
			// not its links may be followed.
			needToStore = false;
			if( noe.isFollowable() == true ) {
				anchors = noe.getAnchorsInThisPage();		// may be null
			}
		}
		Vector an = HREFCollector.getAnchorsAsABSPath( c.getURL(), anchors );
		if( an == null ) {
			return;
		}

		// Number of anchors to accept: all of them when there is no cap
		// (maxFetchCnt == 0) or the cap is not reached, otherwise only the
		// remaining budget. Clamped so the counter can never be decremented.
		int receives = an.size();
		if( maxFetchCnt != 0 && (an.size() + nowFetchCnt) >= maxFetchCnt ) {
			receives = Math.max( 0, maxFetchCnt - nowFetchCnt );
		}

		int nowURLs = urls.size();

		try {
			for( int i = 0; i < receives; i++ ) {
				String found = (String)an.elementAt( i );
				if( dsyncman != null && nowURLs >= cfg.getKeepUrls() && navimode == RSSNavigatorConfig.NAVIMODE_DISTRIB ) {
					// The local pool is full: pool the URL for transfer to the routing
					// node (distributed mode only; the transfer happens during data sync)
					dsyncman.poolURL( found );
				} else {
					// No transfer needed: this search node handles the URL itself
					urls.add( found );
					nowURLs++;
				}
			}
		} catch( SyncQueueException se ) {
			// The remaining anchors of this page are dropped; leave a trace of it
			RSSDebugLogWriter.write( "failed to pool URLs found in " + c.getURL() + ": " + se.getMessage() );
		}

		int beforeCnt = nowFetchCnt / 1000;
		nowFetchCnt += receives;
		int afterCnt = nowFetchCnt / 1000;

		if( beforeCnt != afterCnt ) {
			// Progress marker each time another 1000 URLs have been accepted
			System.err.println( "`FbN|Cgʉ:" + afterCnt * 1000 + "" );
		}

		// Add the filter that converts HTML to plain text
		c.addContentsFilter( new HTMLContentsFilter() );

		// Hand the contents to the contents manager, but only when indexing is permitted
		if( lcm != null && needToStore == true ) {
			boolean interrupted = false;
			synchronized( lcm ) {
				while( lcm.isReceivable() == false ) {
					try {
						lcm.wait();
					} catch( InterruptedException e ) {
						// Keep waiting, but remember the request so it is not lost
						interrupted = true;
					}
				}
			}

			try {
				lcm.store( c );
			} finally {
				if( interrupted ) {
					Thread.currentThread().interrupt();
				}
			}
		}
	}


	/**
	 * Lifts the crawl suspension of the site a fetched robots.txt belongs to.
	 * When the robots.txt has a non-blank body, its rules are loaded into the
	 * site object first. The suspension is lifted whether or not rules could
	 * be loaded, and the URLs released by the suspender are pushed back into
	 * the URL pool.
	 *
	 * @param c the fetched robots.txt
	 * @throws IllegalObjectStateException if no site object is registered for the site
	 */
	public void releaseSearchLock( WebContents c ) throws IllegalObjectStateException {
		String dn = getSiteNameByURL( c.getURL().toString() );
		if( dn == null ) {
			return;
		}

		Site s = (Site)sites.get( dn );
		if( s == null ) {
			// The site object should have been created before robots.txt was requested
			throw new IllegalObjectStateException( "TCgIuWFNgȂ" );
		}

		String webContents = c.getWebContents();
		boolean hasBody = webContents != null && webContents.trim().length() != 0;
		if( hasBody ) {
			// robots.txt came back with some data: parse it into the site object
			ByteArrayInputStream in = new ByteArrayInputStream( webContents.getBytes() );
			try {
				s.getRuleFromTheSite( in );
				RSSDebugLogWriter.write( "TCg'" + dn + "'̃[ݒ肵܂" );
			} catch( IOException ie ) {
				RSSDebugLogWriter.write( "robots.txt̉͂Ɏs" );
			}
		} else {
			// Nothing was returned (e.g. a 404): leave the site without rules
			RSSDebugLogWriter.write( "URL: " + c.getURL().toString() + "݂͑܂ł" );
		}

		// Regardless of how robots.txt turned out, resume crawling of this site
		try {
			sites.put( dn, s );
			String[] releasedURLs = suspender.release( dn );
			if( releasedURLs != null ) {
				// URLs were waiting on this site: return them to the queue
				pushbackURLs( releasedURLs );
			}
			RSSDebugLogWriter.write( "TCg'" + dn + "'̒T}Ԃ܂" );
		} catch( IllegalArgumentException ie ) {
			RSSDebugLogWriter.write( "TCg'" + dn + "'̉Ɏs܂" );
		}
	}

	/**
	 * Returns the next URL to crawl, or null when the routing algorithm has
	 * nothing more to offer.
	 * URLs are taken from the pool through algo.nextURL(); per the original
	 * note the URL handed out is removed from the list automatically
	 * (presumably by nextURL itself - not visible here).
	 * A candidate is returned only when its site lock is held (isLocked) and
	 * either the site is new - in which case the site's robots.txt URL is
	 * returned instead and the candidate is suspended - or robots.txt allows
	 * it (isAllowed). Every other candidate is silently dropped.
	 */
	public String getNextURL() {
		String rc = null;

		while( (rc = algo.nextURL( urls )) != null ) {
			// Check whether the site of this URL is currently suspended.
			try {
				if( suspender.isSuspended( rc ) == true ) {
					// Suspended URL: register it with the suspender (it has left the queue,
					// but is re-queued once the suspension is released).
					// NOTE(review): there is no 'continue' after suspend(), so a suspended URL
					// still runs through the lock/allow checks below - confirm this is intended.
					// NOTE(review): Site.getInstance() receives the site name here, while the
					// other call sites pass a full URL - confirm which one it expects.
					String dn = getSiteNameByURL( rc );
					if( dn != null ) {
						suspender.suspend( Site.getInstance( dn, cfg.getHttpUserAgent() ), rc );
					}
				}
			} catch( IllegalArgumentException e ) {
				// Something is wrong with the URL: discard it.
				// Doing nothing here is what discards it.
				continue;
			}

			// Derive the site name used to look up the Site object for this URL
			String dn = getSiteNameByURL( rc );
			if( dn == null ) {
				// Malformed URL: discard it
				continue;
			}

			// Unused local, kept as in the original
			boolean robotsfg = false;

			// Acquire the lock.
			// Original note: this contacts the routing node on every lock request,
			// which may be a performance problem.
			if( isLocked( rc ) == true ) {

				// The lock was acquired, or was already held
				if( sites.get( dn ) == null ) {

					// No site object yet: return the robots.txt URL instead of this URL
					Site s = Site.getInstance( rc, cfg.getHttpUserAgent() );

					// Register the URL with the suspender under the new site object
					suspender.suspend( s, rc );

					// Remove the URL that triggered the robots.txt lookup from the history,
					// so that it can be explored again later
					algo.removeFromHistory( rc );

					// Return the robots.txt URL of the site in place of the candidate
					rc = s.getRobotsTxtURL();

					// Finally register the site object
					sites.put( dn, s );
					break;
				} else if( isAllowed( rc ) == true ) {
					// The site object exists and crawling is permitted: leave the loop
					break;
				}
			} else {
				// The lock could not be acquired, so treat this URL as if it never existed.
				// Doing nothing here is what discards it.
				// If needed, it could additionally be forwarded to another search node.
			}
		}

		if( flg_EchoURL == true ) {
			// Debug echo of the chosen URL, currently disabled
//			System.err.println( Thread.currentThread().getName() + " " + rc );
		}
		return rc;
	}

	/**
	 * Puts the given URLs back into the URL pool.
	 * A null array is ignored, and so is any URL the pool rejects as malformed.
	 *
	 * @param u the URLs to return to the pool, or null
	 */
	private void pushbackURLs( String[] u ) {
		int total = ( u == null ) ? 0 : u.length;
		int idx = 0;
		while( idx < total ) {
			try {
				urls.add( u[idx] );
			} catch( MalformedURLException ue ) {
				// A URL that cannot be parsed is simply skipped
			}
			idx++;
		}
	}

	/**
	 * Returns how many URLs are left in the URL pool.
	 *
	 * @return the current size of the URL pool
	 */
	public int getQueueSize() {
		return this.urls.size();
	}

	/**
	 * Delegates to the URL pool's waitUntilEmpty().
	 * Per the original note, the calling thread is held while the pool
	 * contains no URLs; the waiting itself is implemented by the pool.
	 */
	public void waitUntilEmpty() {
		this.urls.waitUntilEmpty();
	}

	/**
	 * Tells whether this navigator has finished its work.
	 * That is the case only when no site is suspended, the URL pool is empty,
	 * a fetch cap is configured (maxFetchCnt > 0) and that cap has been reached.
	 *
	 * @return {@code true} when all of the above hold, {@code false} otherwise
	 */
	public boolean isFinished() {
		int pending = countSuspendedSites() + urls.size();
		boolean nothingPending = ( pending == 0 );
		return nothingPending && maxFetchCnt > 0 && nowFetchCnt >= maxFetchCnt;
	}

	/**
	 * Returns the size reported by the URL suspender, i.e. how much is
	 * currently held back from crawling.
	 *
	 * @return the suspender's current size
	 */
	public int countSuspendedSites() {
		return this.suspender.size();
	}

	/**
	 * Extracts the site name from a full URL: the host, followed by
	 * ":port" when the URL carries an explicit port greater than zero.
	 *
	 * @param url the full URL
	 * @return the site name, or null when the URL cannot be parsed
	 */
	private String getSiteNameByURL( String url ) {
		URL parsed;
		try {
			parsed = new URL( url );
		} catch( MalformedURLException e ) {
			return null;
		}

		StringBuffer siteName = new StringBuffer();
		siteName.append( parsed.getHost() );
		int port = parsed.getPort();
		if( port > 0 ) {
			siteName.append( ":" ).append( port );
		}
		return siteName.toString();
	}

	/**
	 * Returns the navigation mode read from the configuration.
	 *
	 * @return {@code true} for distributed mode, {@code false} for standalone mode
	 */
	public boolean getNavigationMode() {
		return this.navimode;
	}
}

// end of RSSWebNavigator.java
