Better duplicate detection in local database

This incorporates much better duplicate detection in the local database, using both the RFC-822 Message-ID and IMAP metadata (INTERNALDATE, RFC822.SIZE) to determine if a message is already stored in the database.  This is very useful when a message is stored in multiple folders, or when an already-downloaded message is returned to a folder it originated in (e.g. INBOX).

Also includes some minor fixes to listing email by EmailIdentifier, which save a round trip to the server in certain edge cases.
This commit is contained in:
Jim Nelson 2011-11-14 12:09:52 -08:00
parent d8fe58bf46
commit f5b7d29a8c
11 changed files with 214 additions and 239 deletions

View file

@ -223,10 +223,11 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
replay_queue.schedule(new ReplayAppend(this, total));
}
// Need to prefetch PROPERTIES (or, in the future NONE or LOCATION) fields to create a
// Need to prefetch at least an EmailIdentifier (and duplicate detection fields) to create a
// normalized placeholder in the local database of the message, so all positions are
// properly relative to the end of the message list; once this is done, notify user of new
// messages.
// messages. If duplicates, create_email_async() will fall through to an updated merge,
// which is exactly what we want.
//
// This MUST only be called from ReplayAppend.
private async void do_replay_appended_messages(int new_remote_count) {
@ -247,7 +248,7 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
// normalize starting at the message *after* the highest position of the local store,
// which has now changed
Gee.List<Geary.Email>? list = yield remote_folder.list_email_async(remote_count + 1, -1,
Geary.Email.Field.PROPERTIES, Geary.Folder.ListFlags.NONE, null);
local_folder.get_duplicate_detection_fields(), Geary.Folder.ListFlags.NONE, null);
assert(list != null && list.size > 0);
foreach (Geary.Email email in list) {
@ -351,6 +352,10 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
flags.is_any_set(Folder.ListFlags.FAST));
}
// TODO: A great optimization would be to fetch message "fragments" from the local database
// (retrieve all stored fields that match required_fields, although not all of required_fields
// are present) and only fetch the missing parts from the remote; to do this right, requests
// would have to be parallelized.
private async void do_list_email_async(int low, int count, Geary.Email.Field required_fields,
Gee.List<Geary.Email>? accumulator, EmailCallback? cb, Cancellable? cancellable,
bool local_only) throws Error {
@ -706,11 +711,18 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
high = initial_position;
}
int actual_count = (high - low + 1);
// low should never be -1, so don't need to check for that
low = low.clamp(1, int.MAX);
debug("do_list_email_by_id_async: initial_id=%s initial_position=%d count=%d actual_count=%d low=%d high=%d local_count=%d remote_count=%d",
int actual_count = ((high - low) + 1);
// one more check for exclusive listing
if (actual_count == 0 || (excluding_id && actual_count == 1))
return;
debug("do_list_email_by_id_async: initial_id=%s initial_position=%d count=%d actual_count=%d low=%d high=%d local_count=%d remote_count=%d excl=%s",
initial_id.to_string(), initial_position, count, actual_count, low, high, local_count,
remote_count);
remote_count, excluding_id.to_string());
yield do_list_email_async(low, actual_count, required_fields, accumulator, cb, cancellable,
local_only);
@ -724,6 +736,11 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
debug("Background fetching %d emails for %s", needed_by_position.length, to_string());
// Always get the flags for normalization and whatever the local store requires for duplicate
// detection
Geary.Email.Field full_fields =
required_fields | Geary.Email.Field.PROPERTIES | local_folder.get_duplicate_detection_fields();
Gee.List<Geary.Email> full = new Gee.ArrayList<Geary.Email>();
int index = 0;
@ -738,52 +755,16 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
list = needed_by_position;
}
// Always get the flags, and the generic end-user won't know to ask for them until they
// need them
Gee.List<Geary.Email>? remote_list = yield remote_folder.list_email_sparse_async(
list, required_fields | Geary.Email.Field.PROPERTIES, Geary.Folder.ListFlags.NONE,
cancellable);
list, full_fields, Geary.Folder.ListFlags.NONE, cancellable);
if (remote_list == null || remote_list.size == 0)
break;
// if any were fetched, store locally
// TODO: Bulk writing
foreach (Geary.Email email in remote_list) {
bool exists_in_system = false;
if (email.message_id != null) {
int count;
exists_in_system = yield local.has_message_id_async(email.message_id, out count,
cancellable);
}
bool exists_in_folder = yield local_folder.is_email_associated_async(email,
cancellable);
// NOTE: Although this looks redundant, this is a complex decision case and laying
// it out like this helps explain the logic. Also, this code relies on the fact
// that update_email_async() is a powerful call which might be broken down in the
// future (requiring a duplicate email be manually associated with the folder,
// for example), and so would like to keep this around to facilitate that.
if (!exists_in_system && !exists_in_folder) {
// This case indicates the email is new to the local store OR has no
// Message-ID and so a new copy must be stored.
yield local_folder.create_email_async(email, cancellable);
} else if (exists_in_system && !exists_in_folder) {
// This case indicates the email has been (partially) stored previously but
// was not associated with this folder; update it (which implies association)
yield local_folder.update_email_async(email, false, cancellable);
} else if (!exists_in_system && exists_in_folder) {
// This case indicates the message doesn't have a Message-ID and can only be
// identified by a folder-specific ID, so it can be updated in the folder
// (This may result in multiple copies of the message stored locally.)
yield local_folder.update_email_async(email, true, cancellable);
} else if (exists_in_system && exists_in_folder) {
// This indicates the message is in the local store and was previously
// associated with this folder, so merely update the local store
yield local_folder.update_email_async(email, false, cancellable);
}
}
foreach (Geary.Email email in remote_list)
yield local_folder.create_email_async(email, cancellable);
if (cb != null)
cb(remote_list, null);
@ -824,7 +805,7 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
Geary.Email email = yield remote_folder.fetch_email_async(id, fields, cancellable);
// save to local store
yield local_folder.update_email_async(email, false, cancellable);
yield local_folder.create_email_async(email, cancellable);
return email;
}
@ -869,13 +850,10 @@ private class Geary.EngineFolder : Geary.AbstractFolder {
debug("prefetching %d (%d) for %s (local_low=%d)", high, prefetch_count, to_string(),
local_low);
// Use PROPERTIES as they're the most useful information for certain actions (such as
// finding duplicates when we start using INTERNALDATE and RFC822.SIZE) and cheap to fetch
//
// TODO: Consider only fetching their UID; would need Geary.Email.Field.LOCATION (or
// perhaps NONE is considered a call for just the UID).
// Normalize the local folder by fetching EmailIdentifiers for all missing email as well
// as fields for duplicate detection
Gee.List<Geary.Email>? list = yield remote_folder.list_email_async(high, prefetch_count,
Geary.Email.Field.PROPERTIES, Geary.Folder.ListFlags.NONE, cancellable);
local_folder.get_duplicate_detection_fields(), Geary.Folder.ListFlags.NONE, cancellable);
if (list == null || list.size != prefetch_count) {
throw new EngineError.BAD_PARAMETERS("Unable to prefetch %d email starting at %d in %s",
count, low, to_string());

View file

@ -91,7 +91,8 @@ private class Geary.GenericImapFolder : Geary.EngineFolder {
cancellable);
if (newest != null && newest.size > 0) {
debug("saving %d newest emails in %s", newest.size, to_string());
debug("saving %d newest emails starting at %s in %s", newest.size, uid_start.to_string(),
to_string());
foreach (Geary.Email email in newest) {
try {
yield local_folder.create_email_async(email, cancellable);
@ -159,8 +160,7 @@ private class Geary.GenericImapFolder : Geary.EngineFolder {
if (remote_uid.value == local_uid.value) {
// same, update flags and move on
try {
yield imap_local_folder.update_email_async(old_remote[remote_ctr], true,
cancellable);
yield local_folder.create_email_async(old_remote[remote_ctr], cancellable);
} catch (Error update_err) {
debug("Unable to update old email in %s: %s", to_string(), update_err.message);
}

View file

@ -10,23 +10,19 @@ private interface Geary.LocalAccount : Object, Geary.Account {
public abstract async void update_folder_async(Geary.Folder folder, Cancellable? cancellable = null)
throws Error;
/**
* Returns true if the email (identified by its Message-ID) already exists in the account's
* local store, no matter the folder.
*
* Note that there are no guarantees of the uniqueness of a Message-ID, or even that a message
* will have one. Because of this situation the method can return the number of messages
* found with that ID.
*/
public async abstract bool has_message_id_async(Geary.RFC822.MessageID message_id,
out int count, Cancellable? cancellable = null) throws Error;
}
private interface Geary.LocalFolder : Object, Geary.Folder {
public async abstract bool is_email_present_async(Geary.EmailIdentifier id,
out Geary.Email.Field available_fields, Cancellable? cancellable = null) throws Error;
/**
* Returns the Geary.Email.Field bitfield of all email fields that must be requested from the
* remote folder in order to do proper duplicate detection within the local folder. May
* return Geary.Email.Field.NONE if no duplicate detection is available.
*/
public abstract Geary.Email.Field get_duplicate_detection_fields();
/**
* Converts an EmailIdentifier into positional addressing in the Folder. This call relies on
* the fact that when a Folder is fully opened, the local stores' tail list of messages (the
@ -41,47 +37,5 @@ private interface Geary.LocalFolder : Object, Geary.Folder {
*/
public async abstract int get_id_position_async(Geary.EmailIdentifier id, Cancellable? cancellable)
throws Error;
/**
* Geary allows for a single message to exist in multiple folders. This method checks if the
* email is associated with this folder. It may rely on a Message-ID being present, in which
* case if it's not the method will throw an EngineError.INCOMPLETE_MESSAGE.
*
* If the email is not in the local store, this method returns false.
*/
public async abstract bool is_email_associated_async(Geary.Email email, Cancellable? cancellable = null)
throws Error;
/**
* Geary allows for a single message to exist in multiple folders. It also allows for partial
* email information to be stored and updated, building the local store as more information is
* downloaded from the server.
*
* update_email_async() updates the email's information in the local store, adding any new
* fields not already present. If the email has fields already stored, the local version *will*
* be overwritten with this new information. However, if the email has fewer fields than the
* local version, the old information will not be lost. In this sense this is a merge
* operation.
*
* update_email_async() will also attempt to associate an email existing in the system with this
* folder. If the message has folder-specific properties that identify it, those will be used;
* if not, update_email_async() will attempt to use the Message-ID. If the Message-ID is not
* available in the email, it will throw EngineError.INCOMPLETE_MESSAGE unless
* duplicate_okay is true, which confirms that it's okay to not attempt the linkage (which
* should be done if the message simply lacks a Message-ID).
* TODO: Examine other fields in the email and attempt to match it with existing messages.
*
* The EmailLocation field is used to position the email in the folder's ordering.
* If another email exists at the same EmailLocation.position, EngineError.ALREADY_EXISTS
* will be thrown.
*
* If the email does not exist in the local store OR the email has no Message-ID and
* duplicate_okay is true OR multiple messages are found in the system with the same
* Message-ID, update_email_async() will see if there's any indication of the email being
* associated with the folder. If so, it will merge in the new information. If not, this
* method will fall through to create_email_async().
*/
public async abstract void update_email_async(Geary.Email email, bool duplicate_okay,
Cancellable? cancellable = null) throws Error;
}