2008-05-21T16:37:00Z
500
The UCD resolver uses the metadata in the current
Registry to suggest UCDs pertinent to natural language column descriptions.
In that, it fulfills a function similar to that of the `CDS UCD builder`_
but uses an entirely different approach.
.. _CDS UCD builder: http://cdsweb.u-strasbg.fr/UCD/cgi-bin/descr2ucd
virtual-observatories
http://www.ivoa.net/Documents/latest/UCDlist.html
Now using the relational registry as the data basis rather than
VizieR columns exclusively. As a consequence, scoring is no longer
based on the old naive Bayesian scheme but on Postgres' full-text
search.
Also enabled the api renderer.
# Return a form holding a single button.  Clicking the button calls the
# client-side function toggleDescriptions with the button's parent node
# (the form) and the URL of the known-descriptions form; the URL's ucd
# parameter is filled with the URL-quoted value of data.
return T.form(action="")[T.button(type="button",
onclick="toggleDescriptions(this.parentNode, "
"'/ucds/ui/known/form?ucd=%s&__nevow_form__=genForm')"
%urllib.parse.quote(data))[
"Show known descriptions"]]
"""
SELECT sum(log(1/(weight+1e-10))) AS score, ucd FROM (
SELECT ts_rank_cd(
to_tsvector('english', column_description),
plainto_tsquery('english', %%(inDescr)s)) AS weight, ucd
FROM rr.table_column
WHERE
ucd IS NOT NULL
%(constraint)s
GROUP BY ucd
ORDER BY score DESC
LIMIT 20"""
def normalizeColumn(rows, key, newMax):
    """Rescale the values under key in rows so the largest becomes newMax.

    rows is a sequence of dicts and is modified in place; nothing is
    returned.  The rescaled values are rounded to six decimal places.

    Nothing is changed when rows is empty or when the largest value is
    not positive (scaling by it would divide by zero or flip signs).
    """
    if not rows:
        # max() of an empty sequence would raise a ValueError.
        return

    curMax = max(r[key] for r in rows)
    if curMax > 0:
        # Fold the factor 1e6 into the normalizer so a single round()
        # followed by a division yields six decimal places.
        normalizer = newMax / float(curMax) * 1e6
        for r in rows:
            r[key] = round(r[key] * normalizer) / 1e6
# Load the helper module res/ucdexplainer from below the resource
# directory of this RD.  loadPythonModule's result is unpacked as a pair;
# only its first element is kept (as ucdexplain), the second is discarded.
ucdexplain, _ = utils.loadPythonModule(
os.path.join(rd.resdir, "res", "ucdexplainer"))
def getMatchesFor(queryString, constraint):
    """Return the rows columnQuery yields for queryString, annotated.

    constraint is an SQL fragment interpolated into columnQuery's
    %(constraint)s placeholder; queryString is passed to the database as
    the inDescr query parameter.

    Each returned dict carries the columns of the query result plus:

    * expl -- the result of ucdexplain.explain for the row's UCD, or "?"
      if no explanation could be produced,
    * toExpl -- a copy of the row's UCD,
    * is_valid -- False if astropyucd.parse_ucd rejects the UCD with a
      ValueError (controlled vocabulary checked, no colons allowed),
      True otherwise.
    """
    rows = []
    with base.getTableConn() as conn:
        for d in conn.queryToDicts(
                columnQuery % {"constraint": constraint},
                {"inDescr": queryString}):
            # Explaining is best-effort: whatever goes wrong in the
            # explainer must not lose the match.  Exception (rather than
            # a bare except) keeps KeyboardInterrupt and SystemExit
            # propagating.
            try:
                d["expl"] = ucdexplain.explain(d["ucd"])
            except Exception:
                d["expl"] = "?"
            d["toExpl"] = d["ucd"]

            try:
                astropyucd.parse_ucd(
                    d["ucd"],
                    check_controlled_vocabulary=True,
                    has_colon=False)
            except ValueError:
                d["is_valid"] = False
            else:
                d["is_valid"] = True

            rows.append(d)
    return rows
# As far as I can tell, there is no index-friendly global ranking
# against incoming document vectors in postgres.
# So, to be robust against "odd" terms and still avoid regularly
# seqscanning all of rr.table_column, I first try the incoming string
# as-is (plainto_tsquery).  If that doesn't yield anything, the
# condition is loosened to "any of" the terms (to_tsquery on the
# |-joined tokens).
#
# Each constraint fragment ends in ") AS q" because it also closes the
# subquery opened in columnQuery.
#
# NOTE(review): the tokens go into to_tsquery unescaped; input containing
# tsquery operator characters presumably makes that query fail -- confirm.
tokens = inputTable.getParam("description").split()
rows = getMatchesFor(" ".join(tokens),
"AND to_tsvector('english', column_description)"
" @@ plainto_tsquery('english', %(inDescr)s)) AS q")
if not rows:
rows = getMatchesFor("|".join(tokens),
"AND to_tsvector('english', column_description)"
" @@ to_tsquery('english', %(inDescr)s)) AS q")
# Neither the strict nor the loosened query matched: report that against
# the description input.
if not rows:
raise base.ValidationError("No matches for these terms."
" Perhaps try adding a few additional words?",
"description")
# Rescale the scores so the best match gets 50, then list best first.
normalizeColumn(rows, "score", 50)
rows.sort(key=lambda r: r["score"], reverse=True)
return rsc.TableForDef(self.outputTable, rows=rows)
ucdguess
UCD resolver
res/query.template
knownucds
Descriptions of known UCDs
res/valuelist.template
# Substitute the string "N/A" for missing (None) values; any other value
# is passed through unchanged.
if data is None:
return "N/A"
return data
ui/form
self.assertHasStrings("Column Description",
"No matches for these terms")
ui/form
self.assertHasStrings("Score</th",
"Show known", "toggleDescriptions", "✓")
known/form
self.assertHasStrings("Airmass at mean epoch",
"Air mass at center of observation")
ui/api
self.assertHasStrings(
'50.0,phot.mag;em.ir.k,Photometric magnitude in',
'True,phot.mag;em.ir.k')
/rr/q/pmh/pubreg.xml
ivo://org.gavo.dc/ucds/ui/ui',
'xsi:type="vs:CatalogService"',
'ucdguess',
'')
]]>
/rr/q/pmh/pubreg.xml
ivo://org.gavo.dc/ucds/ui/ui',
'',
'GAVO Data Center')
]]>
/rr/q/pmh/pubreg.xml
self.assertValidatesXSD()
/rr/q/pmh/pubreg.xml
self.assertValidatesXSD()