I get a twofold speedup using the new FFI (see the timings at the top of each file below). Strangely, passing integers by reference is actually slower than passing them by value (see the last program, right at the bottom):
(p.s. I am not performing deallocation as I should, but this was just for testing)
The code is just an extension of that in http://lists.inf.ed.ac.uk/pipermail/polyml/2015-September/001643.html
************************************************* *************************************************
//intArray.c #include <stdlib.h> #include <stdio.h>
/* An int buffer bundled with its element count. */
typedef struct _intArray {
    int size;   /* number of elements reachable through arr */
    int* arr;   /* element storage */
} intArray;
/*
 * Allocate an intArray holding `size` ints, every element initialised to 0.
 *
 * Returns the new intArray, or NULL when `size` is negative or when either
 * allocation fails.  Release the result with destroyIntArray.
 */
intArray* createIntArray(int size){
    intArray* p;

    /* A negative count would otherwise be converted to a huge size_t. */
    if (size < 0) {
        return NULL;
    }

    p = (intArray*) malloc(sizeof(intArray));
    if (p == NULL) {
        return NULL;
    }

    /* calloc zero-fills the buffer and guards the count * sizeof(int) product
       against overflow. */
    p->arr = (int*) calloc((size_t) size, sizeof(int));
    if (p->arr == NULL && size > 0) {
        /* Do not leak the header when the element buffer cannot be allocated. */
        free(p);
        return NULL;
    }

    p->size = size;
    return p;
}
/*
 * Free an intArray and its element buffer, then print "destroyed".
 *
 * A NULL `p` is ignored, mirroring free(): nothing is released and nothing
 * is printed.
 */
void destroyIntArray(intArray* p){
    if (p == NULL) {
        return;
    }

    free(p->arr);
    free(p);
    printf("destroyed\n");
}
/* Store `val` at index `elem` of p's buffer.  The index is not range-checked. */
void setIntArray(intArray* p, int elem, int val){
    int* slot = p->arr + elem;

    *slot = val;
}
/*
 * Pointer-argument variant of setIntArray: writes the int that `val` points
 * to at the index that `pelem` points to.  The index is not range-checked.
 */
void setIntArrayByRef(intArray* p, int* pelem, int* val){
    const int index = *pelem;
    const int value = *val;

    p->arr[index] = value;
}
/* Return the element at index `elem` of p's buffer.  The index is not range-checked. */
int getIntArray(intArray *p, int elem){
    const int* slot = p->arr + elem;

    return *slot;
}
/*
 * Pointer-argument variant of getIntArray: reads the element at the index
 * that `pelem` points to and stores it through `result`.  The index is not
 * range-checked.
 */
void getIntArrayByRef(intArray *p, int* pelem, int* result){
    const int index = *pelem;

    *result = p->arr[index];
}
/*
 * Print the element count as "size = <n>" and return the sum of all
 * elements of p's buffer.
 */
int getSumIntArray(intArray* p){
    int sum = 0;
    int i;
    int size = p->size;

    printf("size = %i\n",size);
    for(i=0; i<size; i++){
        sum += p->arr[i];
        /* Debug aid, kept disabled as a block comment so it cannot swallow
           the code that follows it:
           printf("elem = %i\n",sum); */
    }
    /* printf("%i\n",sum); */
    return sum;
}
************************************************* *************************************************
(* old ffi *) (* real 0m11.947s user 0m10.736s sys 0m1.160s *) open CInterface;
(* Handle on the shared library, plus a by-path symbol lookup function. *)
val lib = load_lib "./intArray.so";
val get = get_sym "./intArray.so";

(* intArray* values cross the FFI boundary as plain pointers. *)
val PINTARR = POINTER;

(* One raw binding per exported C function. *)
val c1 = call1 (get "createIntArray") INT PINTARR
val c2 = call1 (get "destroyIntArray") PINTARR VOID
val c3 = call3 (get "setIntArray") (PINTARR, INT, INT) VOID
val c4 = call2 (get "getIntArray") (PINTARR, INT) INT
val c5 = call1 (get "getSumIntArray") PINTARR INT
(* Descriptive names for the raw bindings c1 .. c5. *)
fun c_createIntArray n = c1 n;
fun c_destroyIntArray arr = c2 arr;
fun c_setIntArray (arr, idx, v) = c3 (arr, idx, v);
fun c_getIntArray (arr, idx) = c4 (arr, idx);
fun c_getSumIntArray arr = c5 arr;
(* Benchmark parameters. *)
val size : int = 50000;    (* element count requested from createIntArray *)
val loops : int = 30;      (* stop value of the pass counter in benchmarkRun *)
val cap : int = 50000;     (* a predecessor above this counts as 0 in loop *)
(* One pass over the C array.  For i = 1 .. size-1, element i becomes its
   predecessor's value plus one, where a predecessor above `cap` counts
   as 0.  The pass finishes by copying the last element into slot 0. *)
fun loop arr =
  let
    fun successor prev = (if prev > cap then 0 else prev) + 1

    fun step i =
      if i <> size then
        ( c_setIntArray (arr, i, successor (c_getIntArray (arr, i - 1)))
        ; step (i + 1) )
      else
        c_setIntArray (arr, 0, c_getIntArray (arr, size - 1))
  in
    step 1
  end
(* Run `loop` once per counter value 1 .. loops-1.  The counter starts at 1
   and stops on reaching `loops`, so this performs loops-1 passes. *)
fun benchmarkRun arr =
  let
    fun repeat k =
      if k <> loops then (loop arr; repeat (k + 1)) else ()
  in
    repeat 1
  end
(* Create the C array, hand it to setFinal together with the
   destroyIntArray symbol (presumably registering a finaliser; confirm
   against the CInterface documentation), run the benchmark, then print the
   element sum followed by a newline. *)
fun main () =
  let
    val arr = c_createIntArray size
    val finaliser = load_sym lib "destroyIntArray"
  in
    setFinal finaliser arr;
    benchmarkRun arr;
    print (Int.toString (c_getSumIntArray arr) ^ "\n")
  end
************************************************* *************************************************
(* This uses new FFI *) (* real 0m5.382s user 0m4.332s sys 0m0.856s *) open Foreign;
(* Shared library against which every symbol below is resolved. *)
val mylib = loadLibrary "./intArray.so";

(* One raw binding per exported C function; intArray* travels as cPointer. *)
local
  val sym = getSymbol mylib
in
  val c1 = call1 (sym "createIntArray") cInt cPointer
  val c2 = call1 (sym "destroyIntArray") cPointer cVoid
  val c3 = call3 (sym "setIntArray") (cPointer, cInt, cInt) cVoid
  val c4 = call2 (sym "getIntArray") (cPointer, cInt) cInt
  val c5 = call1 (sym "getSumIntArray") cPointer cInt
end
(* Descriptive names for the raw bindings c1 .. c5. *)
fun c_createIntArray n = c1 n;
fun c_destroyIntArray arr = c2 arr;
fun c_setIntArray (arr, idx, v) = c3 (arr, idx, v);
fun c_getIntArray (arr, idx) = c4 (arr, idx);
fun c_getSumIntArray arr = c5 arr;
(* Benchmark parameters. *)
val size : int = 50000;    (* element count requested from createIntArray *)
val loops : int = 30;      (* stop value of the pass counter in benchmarkRun *)
val cap : int = 50000;     (* a predecessor above this counts as 0 in loop *)
(* One pass over the C array.  For i = 1 .. size-1, element i becomes its
   predecessor's value plus one, where a predecessor above `cap` counts
   as 0.  The pass finishes by copying the last element into slot 0. *)
fun loop arr =
  let
    fun successor prev = (if prev > cap then 0 else prev) + 1

    fun step i =
      if i <> size then
        ( c_setIntArray (arr, i, successor (c_getIntArray (arr, i - 1)))
        ; step (i + 1) )
      else
        c_setIntArray (arr, 0, c_getIntArray (arr, size - 1))
  in
    step 1
  end
(* Run `loop` once per counter value 1 .. loops-1.  The counter starts at 1
   and stops on reaching `loops`, so this performs loops-1 passes. *)
fun benchmarkRun arr =
  let
    fun repeat k =
      if k <> loops then (loop arr; repeat (k + 1)) else ()
  in
    repeat 1
  end
(* Create the C array, run the benchmark, then print the element sum
   followed by a newline.  The array is never passed to c_destroyIntArray. *)
fun main () =
  let
    val arr = c_createIntArray size
  in
    benchmarkRun arr;
    print (Int.toString (c_getSumIntArray arr) ^ "\n")
  end
************************************************* *************************************************
(* This uses the new FFI and passes integers by reference *) (* real 0m8.637s user 0m6.652s sys 0m1.632s *) open Foreign;
(* Shared library against which every symbol below is resolved. *)
val mylib = loadLibrary "./intArray.so";

(* Raw bindings.  The element accessors use the *ByRef C entry points, whose
   int* parameters are described with cStar cInt. *)
local
  val sym = getSymbol mylib
  val intPtr = cStar cInt
in
  val c1 = call1 (sym "createIntArray") cInt cPointer
  val c2 = call1 (sym "destroyIntArray") cPointer cVoid
  val c3 = call3 (sym "setIntArrayByRef") (cPointer, intPtr, intPtr) cVoid
  val c4 = call3 (sym "getIntArrayByRef") (cPointer, intPtr, intPtr) cVoid
  val c5 = call1 (sym "getSumIntArray") cPointer cInt
end
(* Descriptive names for the raw bindings c1 .. c5.  Here the getter takes a
   third argument that receives the result. *)
fun c_createIntArray n = c1 n;
fun c_destroyIntArray arr = c2 arr;
fun c_setIntArray (arr, idx, v) = c3 (arr, idx, v);
fun c_getIntArray (arr, idx, out) = c4 (arr, idx, out);
fun c_getSumIntArray arr = c5 arr;
(* Benchmark parameters. *)
val size : int = 50000;    (* element count requested from createIntArray *)
val loops : int = 30;      (* stop value of the pass counter in benchmarkRun *)
val cap : int = 50000;     (* a predecessor above this counts as 0 in loop *)
(* One pass over the C array using the by-reference accessors.  For
   i = 1 .. size-1, element i becomes its predecessor's value plus one,
   where a predecessor above `cap` counts as 0.  The pass finishes by
   copying the last element into slot 0.  Two ref cells are reused for
   every call; only the final write allocates a fresh `ref 0` index. *)
fun loop arr =
  let
    val idx = ref 0     (* index cell passed to C *)
    val cell = ref 0    (* value cell: receives reads, supplies writes *)

    (* Read element i through the shared cells and return it. *)
    fun fetch i = (idx := i; c_getIntArray (arr, idx, cell); !cell)

    fun step i =
      if i <> size then
        let
          val prev = fetch (i - 1)
          val base = if prev > cap then 0 else prev
        in
          idx := i;
          cell := base + 1;
          c_setIntArray (arr, idx, cell);
          step (i + 1)
        end
      else
        ( idx := size - 1
        ; c_getIntArray (arr, idx, cell)
        ; c_setIntArray (arr, ref 0, cell) )
  in
    step 1
  end
(* Run `loop` once per counter value 1 .. loops-1.  The counter starts at 1
   and stops on reaching `loops`, so this performs loops-1 passes. *)
fun benchmarkRun arr =
  let
    fun repeat k =
      if k <> loops then (loop arr; repeat (k + 1)) else ()
  in
    repeat 1
  end
(* Create the C array, run the benchmark, then print the element sum
   followed by a newline.  The array is never passed to c_destroyIntArray. *)
fun main () =
  let
    val arr = c_createIntArray size
  in
    benchmarkRun arr;
    print (Int.toString (c_getSumIntArray arr) ^ "\n")
  end
************************************************* *************************************************
On Sat, Oct 10, 2015 at 4:26 PM, David Matthews < David.Matthews at prolingua.co.uk> wrote:
On 10/10/2015 15:27, Artella Coding wrote:
Hi, thanks I tried the new FFI and it is definitely faster.
I'm glad it's already showing some improvement. There are some potential optimisations that can be made but I'll leave them for the moment.
Out of curiosity, why do the stacks need to be separate? Would it not be
possible to have a ForeignUnsafe for which they share the same stack? Thanks
Having separate stacks is the easiest way of doing it. It wouldn't be a good idea for C code to be run on an ML stack because it could run over the end. ML code checks for stack overflow; C code just runs until it hits a guard page and segfaults. Running ML code on the C stack might be possible. The main problem would be separating out the section(s) of the stack used by ML from that used by C. The GC needs to be able to find and possibly update addresses that ML code has pushed to the stack. With callback functions there could be more than one interleaved section of ML stack. The GC could be invoked in a callback or by another thread while a thread is running C.
While I was writing the new FFI code I was thinking of the possibility of streamlining the calling of C with a view to possibly using it in place of the existing RTS call sequence. I think that's for the next phase and probably the next release.
David _______________________________________________ polyml mailing list polyml at inf.ed.ac.uk http://lists.inf.ed.ac.uk/mailman/listinfo/polyml