Utente:LeDeuxiemeTexte/GoogleOCRFrench.js

'A Wikisource.

Notarella: Aroppo pubbreca putisse necessità 'e pulezzà 'a caché d' 'o navigatóre pe vedé 'e cagnamienti.

  • Firefox / Safari: Sprémme 'o buttóne maiuscole e ffà clic ncopp'a Recarreca, o pure spremme Ctrl-F5 o Ctrl-R (⌘-R ncopp'a Mac)
  • Google Chrome: spremme Ctrl-Shift-R (⌘-Shift-R ncopp'a nu Mac)
  • Internet Explorer/edge: Spremme 'o buttóne Ctrl pe' tramente ca faie click ncopp'a Refresh, o pure spremmere Ctrl-F5
  • Opera: Vaje addò 'o Menu → Mpustaziune (Opera → Mpustaziune ncopp' 'o Mac) e po' ncopp'a Privacy & sicurezza → Pulezza date d' 'o browser → Immaggene e file d' 'a cache.
/*jshint boss:true*/
/*global $, mw*/

/*
Original script from https://nap.wikisource.org/wiki/Utente:Alex_brollo/GoogleOCR.js
To use it: https://nap.wikisource.org/w/index.php?title=User:George2etexte/GoogleOCRFrench.js&action=raw&ctype=text/javascript
Added some code below to post-process texts in French
*/

/**
 * This script adds a toolbar button that replaces the editbox text with OCR text
 * derived by sending the .prp-page-image image through Google's Vision API.
 *
 * For more information, see https://wikisource.org/wiki/Wikisource:Google_OCR
 */

( function ( mw, $ ) {
	// Endpoint of the OCR tool, the spinner image shown while a request is
	// pending, and the system messages this script looks up via mw.msg().
	var toolUrl = '//tools.wmflabs.org/ws-google-ocr/api.php',
		loadingGifUrl = '//upload.wikimedia.org/wikipedia/commons/4/42/Loading.gif',
		sysMessages = [
			'google-ocr-button-label',
			'google-ocr-request-in-progress',
			'google-ocr-no-text',
			'google-ocr-image-not-found'
		],
		// Language code sent to the OCR tool; defaults to the wiki's content language.
		lang = mw.config.get( 'wgContentLanguage' );

	// The three minor Italian Wikisources (nap, vec, pms) are requested as
	// plain Italian ("it") instead of their own language code.
	switch ( lang ) {
		case 'nap':
		case 'vec':
		case 'pms':
			lang = 'it';
			break;
	}

	/**
	 * The initialisation function, run on every load. Adds the OCR button to the
	 * toolbar if we're currently editing or previewing in the Page namespace.
	 */
	function run() {
		// The toolbar preferences are only readable once 'user.options' is loaded.
		mw.loader.using( 'user.options', function () {
			var isPage = mw.config.get( 'wgCanonicalNamespace' ) === 'Page',
				// Preference values can arrive either as the number 1 or as the
				// string "1", so normalise them before comparing; a strict
				// `=== 1` silently skips users whose preference is a string.
				useOldToolbar = Number( mw.user.options.get( 'showtoolbar' ) ) === 1,
				useBetaToolbar = Number( mw.user.options.get( 'usebetatoolbar' ) ) === 1,
				toolbarLib;

			// Nothing to do outside the Page namespace or without any toolbar.
			if ( !isPage || !( useOldToolbar || useBetaToolbar ) ) {
				return;
			}

			// The WikiEditor toolbar takes precedence when both are enabled.
			toolbarLib = useBetaToolbar ? 'ext.wikiEditor' : 'mediawiki.toolbar';
			mw.loader.using( [ 'mediawiki.api', toolbarLib ], function () {
				// Make sure the button label and notices are available to
				// mw.msg() before the button is created.
				new mw.Api().loadMessagesIfMissing( sysMessages ).then( function () {
					customizeToolbar( useBetaToolbar );
				} );
			} );
		} );
	}

	/**
	 * Add the OCR button to the toolbar. This is called in run, and doesn't
	 * need to check anything about whether we need to add the button.
	 *
	 * @param {boolean} useBeta Whether the WikiEditor toolbar should be used.
	 */
	function customizeToolbar( useBeta ) {
		var legacyButton, $legacyButton;

		if ( useBeta ) {
			// New-style WikiEditor toolbar: register the tool once the DOM is ready.
			$( document ).ready( function () {
				var tools = {},
					$editBox;

				tools.GoogleOcr = {
					type: 'button',
					icon: 'https://upload.wikimedia.org/wikipedia/commons/b/bd/GoogleOcr_WikiEditor_button.png',
					labelMsg: 'google-ocr-button-label',
					action: { type: 'callback', execute: doOcr }
				};

				// Placed in main/insert (an earlier alternative was the
				// 'proofreadpage-tools' section, group 'other').
				$editBox = $( "#wpTextbox1" );
				$editBox.wikiEditor( 'addToToolbar', {
					section: 'main',
					group: 'insert',
					tools: tools
				} );

				// Widen the generated link so the whole icon is visible.
				$( "a[rel='GoogleOcr']" ).css( "width", "42px" );
			} );
		} else if ( mw.toolbar ) {
			// Old-style toolbar: add an image button and wire the click by hand.
			legacyButton = {
				imageFile: 'https://upload.wikimedia.org/wikipedia/commons/c/ca/GoogleOcr_toolbar_button.png',
				speedTip: mw.msg( 'google-ocr-button-label' ),
				imageId: 'GoogleOcrButton'
			};
			mw.toolbar.addButton( legacyButton );

			$legacyButton = $( "img#GoogleOcrButton" );
			$legacyButton.on( 'click', doOcr );
			$legacyButton.css( "width", "50px" );
		}

		// Pre-load the loading gif through a hidden image so it is already
		// cached when the first request starts.
		$( '<img />' ).attr( 'src', loadingGifUrl ).appendTo( 'body' ).hide();
	}

	/**
	 * This function is run when the OCR button is clicked. It sends the page
	 * image to the API and replaces the editbox's text with the result.
	 */
	function doOcr() {
		var $image = $( '.prp-page-image img' ),
			imageSrc = $image.attr( 'src' ),
			imageUrl,
			requestUrl;

		// Without a page image there is nothing to OCR: tell the user and stop,
		// instead of sending a request for a bogus "https:undefined" URL.
		if ( $image.length === 0 || !imageSrc ) {
			mw.notify( mw.msg( 'google-ocr-image-not-found' ) );
			return;
		}

		// Send an HTTPS URL because the image will be fetched by PHP inside the
		// tool, where a protocol-relative URL has no meaning. An src that is
		// already absolute is passed through untouched.
		imageUrl = imageSrc.indexOf( '//' ) === 0 ? 'https:' + imageSrc : imageSrc;

		// Encode the parameters so that characters such as "&", "?", "%" or
		// spaces in the file name cannot corrupt the query string.
		requestUrl = toolUrl +
			'?image=' + encodeURIComponent( imageUrl ) +
			'&lang=' + encodeURIComponent( lang );

		showLoadingMsg( 'google-ocr-request-in-progress' );
		$.getJSON( requestUrl )
			.done( processOcrResult )
			.fail( processOcrResult ) // Same handler, for simplicity.
			.always( function () {
				// Hide the loading message whatever the outcome.
				showLoadingMsg( '' );
			} );
	}

	/**
	 * The API result (either the OCR'd text, or an error message) is processed by
	 * this function.
	 *
	 * @param {Object} response The parsed JSON data on success, or the jqXHR object on failure.
	 */
	function processOcrResult( response ) {
		// On failure the jqXHR object is received; the tool's error details,
		// when present, are found under responseJSON.error.
		var errorInfo = response && response.responseJSON && response.responseJSON.error;

		if ( errorInfo ) {
			mw.notify( mw.msg( 'error' ) + ' ' + errorInfo.code + ' ' + errorInfo.message );
			return;
		}

		// Covers an empty OCR result as well as a missing or malformed
		// response (e.g. a failure without a JSON body).
		if ( !response || typeof response.text !== 'string' || response.text.length === 0 ) {
			mw.notify( mw.msg( 'google-ocr-no-text' ) );
			return;
		}

		// Replace the whole edit box content with the cleaned-up OCR text.
		$( '#wpTextbox1' ).val( postProcessFrenchText( response.text ) );
	}

	/**
	 * Clean up raw OCR text of a French page: dialogue dashes, words
	 * hyphenated across lines, page headers and paragraph ends.
	 *
	 * @param {string} text The raw OCR text.
	 * @return {string} The post-processed text.
	 */
	function postProcessFrenchText( text ) {
		// Replace "-" at the beginning of a line by "— " (for dialogues),
		// whether or not the hyphen was followed by a space.
		text = text.replace( /\n-([^ ])/g, '\n— $1' ).replace( /\n- /g, '\n— ' );

		// Glue together the parts of a word cut at the end of a line: the
		// hyphen is dropped and the rest of the word is pulled up onto the
		// first line, the line break moving after it.
		text = text.replace( /-[ ]*\n([^ ]+ )/g, '$1\n' );

		// Remove the first lines if they are made only of digits or uppercase
		// characters and punctuation (probably page headers). A temporary
		// 'start¤' marker anchors each attempt to the beginning of the text;
		// each of the five replacements removes at most one leading line.
		text = ( 'start¤' + text )
			.replace( /(start¤)[0-9A-ZÉÈÊËÀÂÄÎÏÌÔÖÒÜÙÛÇ.\-, ]+\n/, '$1' )
			.replace( /(start¤)[0-9]+\n/, '$1' )
			.replace( /(start¤)[A-Z.\-, ]+\n/, '$1' )
			.replace( /(start¤)[0-9]+\n/, '$1' )
			.replace( /(start¤)[A-ZÉÈÊËÀÂÄÎÏÌÔÖÒÜÙÛÇ.\-, ]+\n/, '$1' )
			.replace( /start¤/, '' );

		// Add an empty line after a short line (2 to 20 characters), which is
		// probably the end of a paragraph.
		return text.replace( /\n(.{2,20})\n/g, '\n$1\n\n' );
	}

	/**
	 * Show (or hide) a loading message. Pass an empty string to remove the message altogether.
	 *
	 * @param {string} msgLabel The label of the system message to show.
	 */
	function showLoadingMsg( msgLabel ) {
		// All locals are declared in one comma-separated list; a missing comma
		// here would turn loadingId into an implicit global (an error in
		// strict mode).
		var loadingId = 'GoogleOcrLoading',
			msgBox,
			loadingGif;

		// Always remove any existing message.
		$( '#' + loadingId ).remove();

		// Any empty label ('', false, null, undefined) only hides the message.
		if ( !msgLabel ) {
			return;
		}

		// Build the message box; .text() inserts the message as plain text.
		msgBox = $( '<p>' )
			.attr( 'id', loadingId )
			.css( 'background-color', '#efefef' ).css( 'border', '1px solid #ccc' )
			.text( mw.message( msgLabel ).plain() );
		loadingGif = $( '<img>' )
			.attr( 'src', loadingGifUrl )
			.attr( 'alt', 'Animated loading indicator' )
			.css( 'display', 'inline-block' ).css( 'margin', '0.3em' );

		// Spinner first, then the text, placed right above the edit box.
		msgBox.prepend( loadingGif );
		$( '#wpTextbox1' ).before( msgBox );
	}

	// Start the initialisation as soon as this script is evaluated.
	run();
}( mediaWiki, jQuery ) );