After this, it should be possible to get records from different databases, some
with many records, some with a few. This is a good testing ground for merging
rankings! Test first with a round-robin, and plot the scores.
+
+Thu 28-Nov
+Ok, I can now merge a number of Solr databases (harvest jobs), and plot their rankings
+as Solr gives them, in the order of different merge strategies.
+Next: Add the normalizing merge strategy. Then plot different strategies against different queries.
+Write a conclusion, and consider this plotting job done.
+
+
+
+
--- /dev/null
+#!/bin/bash
+#
+# Run the test with a number of queries, plot the results
+#
+
+if [ "$1" == "" ]
+then
+ echo "Need an argument, the name of this test run"
+ echo "It will be in the title of all plots, together with the query"
+ exit 1
+fi
+TITLE="$1"
+OUTFILE=`echo $1.txt | sed 's/ /_/g'`
+echo "$TITLE" > $OUTFILE
+./test3.sh clean
+
+# Run one query through test3.sh and record the outcome.
+#
+#   $1 - the query string, handed to test3.sh unchanged
+#
+# Reads the globals TITLE and OUTFILE, sets the globals QRY and PNG.
+# Appends to $OUTFILE: an empty line, the query, the name of the saved
+# graph, and the first ten "plotline" lines found in show.out.
+# Copies plot.png to a file named after the run title and the query, with
+# spaces turned into underscores. show.out and plot.png are expected to be
+# left behind by test3.sh.
+function onerun() {
+ QRY="$1"
+ PNG=`echo "solr_$TITLE $QRY.png" | sed 's/ /_/g'`
+ echo "" >> "$OUTFILE"
+ echo "Query: $QRY" >> "$OUTFILE"
+ echo "Graph: $PNG" >> "$OUTFILE"
+ ./test3.sh "$QRY" "$TITLE"
+ grep "plotline" show.out | head -10 >> "$OUTFILE"
+ # Quoted on purpose: the name is derived from the query, which may
+ # contain characters such as * or ? that the shell would otherwise expand.
+ cp plot.png "$PNG"
+}
+
+# Run the fixed set of test queries, one onerun call for each
+for testquery in "harry potter" "vietnam war" "water or fire or ice"
+do
+ onerun "$testquery"
+done
+
+# Close the result file with an empty line and a line naming the columns
+{
+ echo ""
+ echo "client#, position, tf/idf, roundrobin, solr # database # title"
+} >> $OUTFILE
+
<!-- Solr target -->
-<settings target="LUI Solr Test">
- <set name="pz:name" value="LUI Solr Test" />
- <set name="pz:url" value="lui.indexdata.com/solr" />
+<!-- General settings for all the solr targets in this test -->
+<settings target="*">
+ <!-- Individual databases in the solr system -->
+ <set target="lui.indexdata.com/solr#3902" name="pz:name" value="Solr base 3902 (25m)" />
+ <set target="lui.indexdata.com/solr#3902" name="pz:url" value="lui.indexdata.com/solr#3902" />
+ <set target="lui.indexdata.com/solr#3902" name="pz:extra_args" value="fq=database:3902" />
+
+ <set target="lui.indexdata.com/solr#5802" name="pz:name" value="Solr base 5802 (7m)" />
+ <set target="lui.indexdata.com/solr#5802" name="pz:url" value="lui.indexdata.com/solr#5802" />
+ <set target="lui.indexdata.com/solr#5802" name="pz:extra_args" value="fq=database:5802" />
+
+ <set target="lui.indexdata.com/solr#3602" name="pz:name" value="Solr base 3602 (4m)" />
+ <set target="lui.indexdata.com/solr#3602" name="pz:url" value="lui.indexdata.com/solr#3602" />
+ <set target="lui.indexdata.com/solr#3602" name="pz:extra_args" value="fq=database:3602" />
+
+ <set target="lui.indexdata.com/solr#6202" name="pz:name" value="Solr base 6202 (1.6m)" />
+ <set target="lui.indexdata.com/solr#6202" name="pz:url" value="lui.indexdata.com/solr#6202" />
+ <set target="lui.indexdata.com/solr#6202" name="pz:extra_args" value="fq=database:6202" />
+
+ <set target="lui.indexdata.com/solr#4905" name="pz:name" value="Solr base 4905 (100k)" />
+ <set target="lui.indexdata.com/solr#4905" name="pz:url" value="lui.indexdata.com/solr#4905" />
+ <set target="lui.indexdata.com/solr#4905" name="pz:extra_args" value="fq=database:4905" />
+
+ <set target="lui.indexdata.com/solr#6103" name="pz:name" value="Solr base 6103 (1k)" />
+ <set target="lui.indexdata.com/solr#6103" name="pz:url" value="lui.indexdata.com/solr#6103" />
+ <set target="lui.indexdata.com/solr#6103" name="pz:extra_args" value="fq=database:6103" />
+
+ <!-- General settings for them all -->
<set name="pz:limitmap:author" value="rpn:@attr 1=author_exact 6=3" />
<set name="pz:limitmap:subject" value="rpn:@attr 1=subject_exact" />
<set name="pz:limitmap:date" value="rpn:@attr 1=date @attr 6=3" />
<service>
<timeout session="60" z3950_operation="30" z3950_session="180"/>
- <!-- settings src="bibliotek.dk.xml"/-->
+ <!-- General SOLR settings -->
<settings src="solr.lui.xml"/>
+ <!-- A number of databases (also includes the general settings) -->
+ <!--settings src="solr.*.xml"/-->
+
+
<icu_chain id="relevance" locale="en">
<transform rule="[:Control:] Any-Remove"/>
<tokenize rule="l"/>
<!-- rank cluster="yes" lead="1" length="log" debug="no"/ Autographics settings-->
<rank cluster="yes" lead="1" length="log" debug="yes"/>
+
<!-- we try to keep same order as in marc21.xsl -->
<metadata name="id" brief="yes"/>
<metadata name="lccn" merge="unique"/>
<metadata name="due"/>
<metadata name="thumburl" brief="yes" merge="unique"/>
- <metadata name="score" brief="yes" sortkey="numeric" merge="range"/>
+ <!--metadata name="score" brief="yes" sortkey="numeric" merge="range"/-->
+ <metadata name="score" brief="yes" />
+
</service>
</server>
else
Q=$1
fi
+
+if [ -z "$2" ]
+then
+ HEADLINE="$Q"
+else
+ HEADLINE="$2: $Q"
+fi
+
QRY=`echo $Q | sed 's/ /+/g' `
-SORT="sort=score"
+#SORT="sort=score"
+SORT="sort=relevance_h"
#SEARCH="command=search$SES&$QRY&rank=1&sort=relevance"
#SEARCH="command=search$SES&$QRY"
#SEARCH="command=search$SES&query=$QRY&sort=relevance"
curl -s "http://localhost:9017/?$SHOW" > show.out
#grep "relevance" show.out | grep += | grep -v "(0)"
#grep "round-robin" show.out
-grep '^ <md-title>' show.out | head -11
-grep 'Received' dbc-opensearch-gw.log | head -1 >> titles.out
-grep '^ <md-title>' show.out >> titles.out
+
+# Plot the lines created by the code
+grep plotline show.out > scores.data
+echo "Client numbers"
+cat scores.data | cut -d' ' -f2 | sort -u
+head -10 scores.data
+
+echo "
+ set term png
+ set out \"plot.png\"
+ set title \"$HEADLINE\"
+" > plot.cmd
+echo '
+ plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \
+ "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \
+ "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \
+ "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \
+ "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \
+ "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \
+' >> plot.cmd
+cat plot.cmd | gnuplot
+
+
+exit 1 # The old plotting code
# Plot it
DF=`echo $QRY | sed 's/@//g' | sed 's/[+"]/_/g' | sed s"/'//g "`
r->doc_frequency_vec[0]++;
}
+/*
+ * Fetch the display text of one metadata field of a record.
+ *
+ * bestrecord: the record to read from; its client is used to find the
+ *             session, and through that the service configuration.
+ * tag:        name of the metadata field to look up in the service.
+ *
+ * Returns the data.text.disp pointer of the first metadata entry stored
+ * under that field, or an empty string literal when the field lookup
+ * yields a negative id or the record has no entry for the field.
+ * NOTE(review): reads the entry as text without checking the field's
+ * type - confirm that callers only ask for text fields.
+ */
+static const char *getfield(struct record *bestrecord, const char *tag)
+{
+ struct session *session = client_get_session(bestrecord->client);
+ int field_id = conf_service_metadata_field_id(session->service, tag);
+
+ if (field_id >= 0) {
+ struct record_metadata *entry = bestrecord->metadata[field_id];
+
+ if (entry != 0)
+ return entry->data.text.disp;
+ }
+ return "";
+}
+
// Prepare for a relevance-sorted read
void relevance_prepare_read(struct relevance *rel, struct reclist *reclist,
enum conf_sortkey_type type)
int thisclient = 0;
struct record *bestrecord = 0;
int nclust = 0;
+ // Find the best record in a cluster - the one with lowest position
for (record = rec->records; record; record = record->next) {
if ( bestrecord == 0 || bestrecord->position < record->position )
bestrecord = record;
- nclust++;
+ nclust++; // and count them all, for logging
}
+ // Find the client number for the record (we only have a pointer to its client)
while ( clients[thisclient] != 0
&& clients[thisclient] != bestrecord->client )
thisclient++;
yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client);
clients[thisclient] = bestrecord->client;
}
- int tfrel = relevance;
- relevance = -(bestrecord->position * n_clients + thisclient) ;
+ // Calculate a round-robin score
+ int tfrel = relevance; // keep the old tf/idf score
+ int robinscore = -(bestrecord->position * n_clients + thisclient) ;
wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n",
bestrecord->position, thisclient, nclust, tfrel, relevance );
yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d",
bestrecord->position, thisclient, nclust, relevance );
+
+ // Check if the record has a score field
+ const char *score = getfield(bestrecord,"score");
+ int solrscore = 10000.0 * atof(score);
+ const char *id = getfield(bestrecord, "id");
+ // clear the id, we only want the first numerical part
+ char idbuf[64];
+ i=0;
+ while( id[i] >= '0' && id[i] <= '9' ) {
+ idbuf[i] = id[i];
+ i++;
+ }
+ idbuf[i] = '\0';
+
+ const char *title = getfield(bestrecord, "title");
+ wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n",
+ thisclient, bestrecord->position,
+ tfrel, robinscore, solrscore, idbuf, title );
+ relevance = solrscore;
}
rec->relevance_score = relevance;
}