Il existe un analyseur morphologique open source, écrit en OCaml, nommé ocamorph. Sa liaison Java (JNI) est boguée et je dois la réparer. Après quelques heures de lutte, il me semble que cela va prendre plusieurs jours, car je ne connais ni le C, ni JNI, ni OCaml, ni ce logiciel en particulier. Symptôme : erreur de segmentation dans la liaison JNI.
Comme on peut le voir ci-dessous, le programme fonctionne pour un petit fichier (subtitles_136.hu.tok), mais une « erreur de segmentation » se produit pour un fichier plus volumineux (Tolkien_1.hu.tok) :
[email protected]:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache2.txt > src/java/mokk/nlp/ocamorph/subtitles_136.hu.stem < src/java/mokk/nlp/ocamorph/subtitles_136.hu.tok
[email protected]:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache.txt > src/java/mokk/nlp/ocamorph/Tolkien_1.en.stem < src/java/mokk/nlp/ocamorph/Tolkien_1.en.tok
Segmentation fault
[email protected]:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ ls -l src/java/mokk/nlp/ocamorph/
total 2116
-rw-rw-r-- 1 bpgergo breka 8505 2009-09-22 13:53 cache2.txt
-rw-rw-r-- 1 bpgergo breka 65 2009-07-07 18:48 Compounds.java
drwxrwxr-x 2 bpgergo breka 4096 2009-09-22 13:54 CVS
-rw-rw-r-- 1 bpgergo breka 5888 2009-09-18 17:19 FileStemmer.java
-rw-rw-r-- 1 bpgergo breka 77 2009-07-07 18:48 Guess.java
-rw-rw-r-- 1 bpgergo breka 953 2009-08-31 18:58 IOcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 5419 2009-08-31 18:58 OcamorphCachedStemmer.java
-rw-rw-r-- 1 bpgergo breka 2836 2009-08-03 16:00 OcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 4612 2009-09-22 12:51 OcamorphWrapper.java
-rw-rw-r-- 1 bpgergo breka 6731 2009-09-22 13:53 subtitles_136.hu.stem
-rw-rw-r-- 1 bpgergo breka 7356 2009-09-20 21:12 subtitles_136.hu.tok
-rw-rw-r-- 1 bpgergo breka 2907 2009-09-18 17:22 Tester.java
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:53 Tolkien_1.en.stem
-rw-rw-r-- 1 bpgergo breka 1033059 2009-09-17 16:09 Tolkien_1.en.tok
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:14 Tolkien_1.hu.stem
-rw-rw-r-- 1 bpgergo breka 1041968 2009-09-17 16:09 Tolkien_1.hu.tok
[email protected]:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $
Voici la partie C de la liaison Java (/ocamorph/src/bindings/java/src/c/hunmorph_jnistub.c). C'est peut-être là que se trouve le bogue ; merci d'avance pour toute piste ou toute aide pour le trouver :
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mokk_nlp_ocamorph_OcamorphWrapper.h"
#include "ocamorph.h"
/* Number of slots in `analyses`; passed to analyze() as the result-count limit. */
#define MAX_ANALYSIS 100
/* Size in bytes of each buffer malloc'ed into `analyses` by the init stub;
 * passed to analyze() together with MAX_ANALYSIS. */
#define ANALYSIS_MAXLEN 100
// buffer for a single analysis string
// NOTE(review): not referenced by any function visible in this file -- confirm before removing
char analysis[ANALYSIS_MAXLEN];
// input buffer
// NOTE(review): not referenced by any function visible in this file -- confirm before removing
char buffer[500];
/* Result buffers shared by every call: allocated by the init stub, filled by
 * analyze() and read back by the analyze stub. File-scope, so shared by all
 * OcamorphWrapper instances. */
char* analyses[MAX_ANALYSIS];
/* Method ID of OcamorphWrapper.callback(byte[]); stored by the initIDs stub and
 * used by the analyze stub to hand each result back to Java. */
jmethodID MID_InstanceMethodCall_callback;
/*
 * Native half of OcamorphWrapper.initIDs().
 *
 * Looks up the instance method callback(byte[]) on the given class and stores
 * its method ID in MID_InstanceMethodCall_callback for later calls.
 */
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_initIDs
(JNIEnv *env, jclass cls) {
    jmethodID callback_id = (*env)->GetMethodID(env, cls, "callback", "([B)V");

    MID_InstanceMethodCall_callback = callback_id;
}
/*
 * Native half of OcamorphWrapper.init(String).
 *
 * Calls ocamorph_startup(), loads an engine from the binary resource named by
 * bin_arg and makes sure every slot of the shared `analyses` array holds an
 * ANALYSIS_MAXLEN-byte buffer.
 *
 * Parameters:
 *   env     - JNI environment of the calling thread.
 *   obj     - the OcamorphWrapper instance (unused).
 *   bin_arg - path of the binary resource, as a Java string.
 *
 * Returns the engine handle cast to jlong. When bin_arg is null, the string
 * cannot be converted, or a result buffer cannot be allocated, a Java
 * exception is left pending and 0 is returned.
 */
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_init
(JNIEnv * env, jobject obj, jstring bin_arg) {
    const char *bin_file;
    ocamorph_engine engine;
    int i;

    /* Passing a null jstring to GetStringUTFChars is undefined behaviour. */
    if (bin_arg == NULL) {
        jclass npe = (*env)->FindClass(env, "java/lang/NullPointerException");
        if (npe != NULL) {
            (*env)->ThrowNew(env, npe, "bin must not be null");
        }
        return 0;
    }

    /* Convert to (modified) UTF-8; the last argument is an optional
     * jboolean* out-parameter, so NULL (not JNI_FALSE) is the right value. */
    bin_file = (*env)->GetStringUTFChars(env, bin_arg, NULL);
    if (bin_file == NULL) {
        /* OutOfMemoryError is already pending. */
        return 0;
    }

    /* NOTE(review): ocamorph_startup() runs on every init call, as before;
     * whether it may safely be called more than once is not visible here. */
    ocamorph_startup();
    engine = init_from_bin(bin_file, 0 /* Don't pass the stupid no_caps argument */);

    /* Release created UTF8 string */
    (*env)->ReleaseStringUTFChars(env, bin_arg, bin_file);

    /* `analyses` is a zero-initialised file-scope array, so a NULL slot means
     * "not allocated yet". Allocating only empty slots keeps a second init
     * call from leaking the buffers of the first one. */
    for (i = 0; i < MAX_ANALYSIS; i++) {
        if (analyses[i] != NULL) {
            continue;
        }
        analyses[i] = (char *) malloc(ANALYSIS_MAXLEN * sizeof(char));
        if (analyses[i] == NULL) {
            jclass oom = (*env)->FindClass(env, "java/lang/OutOfMemoryError");
            if (oom != NULL) {
                (*env)->ThrowNew(env, oom, "cannot allocate ocamorph result buffers");
            }
            return 0;
        }
    }

    return (jlong) engine;
}
/*
 * Native half of OcamorphWrapper.make_analyzer(...).
 *
 * Forwards the engine handle and the four integer options unchanged to
 * make_analyzer() and returns the resulting handle cast to jlong.
 */
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_make_1analyzer
(JNIEnv *env, jobject obj, jlong engine , jint blocking, jint compounds, jint stop_at_first, jint guess) {
    ocamorph_engine source_engine = (ocamorph_engine) engine;
    ocamorph_engine new_analyzer =
        make_analyzer(source_engine, blocking, compounds, stop_at_first, guess);

    return (jlong) new_analyzer;
}
/*
 * Native half of OcamorphWrapper.analyze(long, byte[]).
 *
 * Copies the bytes of `word` into a NUL-terminated local buffer (input of
 * MAX_INPUT_LENGTH bytes or more is truncated), runs analyze() on it and
 * passes each result to the Java method callback(byte[]) on `obj`.
 *
 * Parameters:
 *   env      - JNI environment of the calling thread.
 *   obj      - the OcamorphWrapper instance whose callback receives results.
 *   analyzer - analyzer handle previously returned by the make_analyzer stub.
 *   word     - the word to analyze, as raw bytes.
 *
 * Returns nothing. If `word` is null, the callback ID was never initialised,
 * a byte array cannot be allocated, or the callback throws, the function
 * returns early with the Java exception left pending.
 */
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_analyze
(JNIEnv * env, jobject obj, jlong analyzer, jbyteArray word) {
    /* Compile-time constant, so wordc is a plain array rather than a VLA. */
    enum { MAX_INPUT_LENGTH = 1000 };
    ocamorph_engine analyzerc = (ocamorph_engine) analyzer;
    char wordc[MAX_INPUT_LENGTH];
    jsize len;
    int n;
    int i;

    /* GetArrayLength on a null array is undefined behaviour. */
    if (word == NULL) {
        jclass npe = (*env)->FindClass(env, "java/lang/NullPointerException");
        if (npe != NULL) {
            (*env)->ThrowNew(env, npe, "word must not be null");
        }
        return;
    }
    /* Calling through a NULL method ID would crash the VM. */
    if (MID_InstanceMethodCall_callback == NULL) {
        jclass ise = (*env)->FindClass(env, "java/lang/IllegalStateException");
        if (ise != NULL) {
            (*env)->ThrowNew(env, ise, "callback method ID is not initialised");
        }
        return;
    }

    len = (*env)->GetArrayLength(env, word);
    if (len >= MAX_INPUT_LENGTH) {
        len = MAX_INPUT_LENGTH - 1; /* leave room for the terminator */
    }
    if (len > 0) {
        (*env)->GetByteArrayRegion(env, word, 0, len, (jbyte *) wordc);
    }
    wordc[len] = '\0';

    n = analyze(analyzerc, wordc, analyses, MAX_ANALYSIS, ANALYSIS_MAXLEN);
    /* Never read past the end of `analyses`, whatever count comes back. */
    if (n > MAX_ANALYSIS) {
        n = MAX_ANALYSIS;
    }

    for (i = 0; i < n; ++i) {
        const char *ana = analyses[i];
        const char *terminator;
        jsize ana_len;
        jbyteArray jb;

        if (ana == NULL) {
            break; /* result buffers were never allocated */
        }

        /* Bounded length: the buffer holds ANALYSIS_MAXLEN bytes, so do not
         * scan beyond it if the terminator is missing. */
        terminator = memchr(ana, '\0', ANALYSIS_MAXLEN);
        ana_len = (terminator != NULL) ? (jsize) (terminator - ana)
                                       : (jsize) ANALYSIS_MAXLEN;

        jb = (*env)->NewByteArray(env, ana_len);
        if (jb == NULL) {
            return; /* OutOfMemoryError is already pending */
        }
        (*env)->SetByteArrayRegion(env, jb, 0, ana_len, (const jbyte *) ana);
        (*env)->CallVoidMethod(env, obj, MID_InstanceMethodCall_callback, jb);

        /* Free the local reference right away instead of accumulating up to
         * MAX_ANALYSIS of them until this native method returns. */
        (*env)->DeleteLocalRef(env, jb);

        /* Stop on a Java exception so it propagates to the caller; further
         * JNI calls with an exception pending are not allowed. */
        if ((*env)->ExceptionCheck(env)) {
            return;
        }
    }
}
Et voici la partie Java (/ocamorph/src/bindings/java/src/java/mokk/nlp/ocamorph/OcamorphWrapper.java) :
package mokk.nlp.ocamorph;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;
/**
 * JNI interface for Ocamorph. The constructor loads an ocamorph engine from a
 * binary resource and creates an analyzer on top of it.
 *
 * <p>Instances are not thread-safe: {@link #analyze(String)} stores the result
 * list in an instance field that the native side fills through
 * {@code callback(byte[])}.
 *
 * @author bpgergo
 */
public class OcamorphWrapper {

    /** The encoding required by the ocamorph lib. */
    private static final String ENCODING = "ISO-8859-2";

    static {
        // TODO FIXME how to define the library dynamically?
        System.loadLibrary("ocamorph");
        initIDs();
    }

    /** Handle of the analyzer returned by the native {@code make_analyzer}. */
    private final long analyzerId;

    /** Handle of the engine returned by the native {@code init}. */
    private final long engineId;

    /** Result of the current analysis; the native callback appends to it. */
    private List<String> analyzeResult = null;

    /** Lets the native side look up and cache the ID of {@code callback(byte[])}. */
    private static native void initIDs();

    /** Loads an engine from the given binary resource and returns its handle. */
    private native long init(String bin);

    // Native signature: const ocamorph_engine engine, const int blocking,
    // const int compounds, const int stop_at_first, const int guess.
    // Original author's note (translated from Hungarian): there is some bug in
    // ocamorph, because stop_at_first controls compounding.
    private native long make_analyzer(long engine, int blocking, int compounds,
            int stop_at_first, int guess);

    /** Analyzes the given bytes; results arrive through {@code callback(byte[])}. */
    private native void analyze(long analyzer, byte[] word);

    /**
     * Loads a new Ocamorph engine, using the given binary resource and the arguments.
     *
     * @param bin path of the binary resource to load
     * @param blocking passed to the native analyzer as 1 (true) or 0 (false)
     * @param stopAtFirst passed to the native analyzer as 1 (true) or 0 (false)
     * @param compounds compound handling mode
     * @param guess guessing mode
     */
    public OcamorphWrapper(String bin, boolean blocking, boolean stopAtFirst,
            Compounds compounds, Guess guess) {
        engineId = init(bin);
        int comp = compounds2Code(compounds);
        int gu = guessToCode(guess);
        // NOTE(review): make_analyzer is declared as (engine, blocking, compounds,
        // stop_at_first, guess), but this call passes the stopAtFirst code third
        // and the compounds code fourth. This may be a deliberate workaround for
        // the behaviour described next to the native declaration, or its cause.
        // The order is kept unchanged because callers may rely on it -- confirm
        // which one is intended.
        analyzerId = make_analyzer(engineId, boolean2Code(blocking),
                boolean2Code(stopAtFirst), comp, gu);
    }

    /**
     * This is the interface method for ocamorph analysis for the java side.
     *
     * @param word the word to analyze; it is encoded with {@link #getEncoding()}
     *        before being handed to the native analyzer
     * @return the analyses reported by the native side, in callback order; empty
     *         if the word could not be encoded
     */
    public List<String> analyze(String word) {
        analyzeResult = new LinkedList<String>();
        byte[] ba;
        try {
            ba = word.getBytes(ENCODING);
        } catch (UnsupportedEncodingException e1) {
            System.err
                    .println("Ocamorph analyze UnsupportedEncodingException: ");
            e1.printStackTrace();
            // Best effort, as before: report the problem and return no analyses.
            return analyzeResult;
        }
        analyze(analyzerId, ba);
        return analyzeResult;
    }

    /**
     * The C interface will call this method to return analysis results.
     *
     * @param ana one analysis, as bytes in the ocamorph encoding
     */
    private void callback(byte[] ana) {
        try {
            // bpgergo 20090618: decoding with the platform default charset was a bug.
            analyzeResult.add(new String(ana, ENCODING));
        } catch (UnsupportedEncodingException e) {
            // Report and skip the entry rather than adding a null to the results.
            System.err.println("callback new String(ana, encoding) UnsupportedEncodingException:");
            e.printStackTrace();
        }
    }

    /* static argument conversion methods */

    /** Maps {@code true} to 1 and {@code false} to 0. */
    private static int boolean2Code(boolean bool) {
        return bool ? 1 : 0;
    }

    /** Maps {@code Allow} to 1 and every other value to 0. */
    private static int compounds2Code(Compounds compounds) {
        switch (compounds) {
        case Allow:
            return 1;
        case No:
        default:
            return 0;
        }
    }

    /** Maps {@code Fallback} to 1, {@code Global} to 2 and every other value to 0. */
    private static int guessToCode(Guess guess) {
        switch (guess) {
        case Fallback:
            return 1;
        case Global:
            return 2;
        case NoGuess:
        default:
            return 0;
        }
    }

    /* getter/setter methods */

    /** Returns the name of the charset used to exchange bytes with ocamorph. */
    public String getEncoding() {
        return ENCODING;
    }

    /** Returns the native analyzer handle. */
    public long getAnalyzerId() {
        return analyzerId;
    }

    /** Debug output is disabled in this class; always returns {@code false}. */
    public boolean isDebug() {
        return false;
    }

    /** Kept for API compatibility; the argument is ignored. */
    public void setDebug(boolean debug) {
        // Intentionally empty: debug output is disabled.
    }
}