diff --unified --recursive --new-file linux-2.4.20/arch/i386/config.in linux-2.4.20-rbed/arch/i386/config.in --- linux-2.4.20/arch/i386/config.in 2002-11-28 15:53:09.000000000 -0800 +++ linux-2.4.20-rbed/arch/i386/config.in 2003-04-13 15:48:47.000000000 -0700 @@ -207,7 +207,7 @@ bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP if [ "$CONFIG_SMP" != "y" ]; then - bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC + bool 'APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC if [ "$CONFIG_X86_UP_APIC" = "y" ]; then define_bool CONFIG_X86_LOCAL_APIC y @@ -215,6 +215,12 @@ if [ "$CONFIG_X86_UP_IOAPIC" = "y" ]; then define_bool CONFIG_X86_IO_APIC y fi + if [ "$CONFIG_X86_UP_APIC" = "y" ]; then + bool 'Local APIC timer support' CONFIG_APIC_TIMER_UP + if [ "$CONFIG_APIC_TIMER_UP" = "y" ]; then + define_bool CONFIG_X86_LOCAL_APIC y + fi + fi else bool 'Multiquad NUMA system' CONFIG_MULTIQUAD fi @@ -303,6 +309,12 @@ if [ "$CONFIG_ACPI" != "n" ]; then source drivers/acpi/Config.in fi + if [ "$CONFIG_X86_UP_APIC" = "y" ]; then + bool 'Local APIC timer support' CONFIG_APIC_TIMER_UP + if [ "$CONFIG_APIC_TIMER_UP" = "y" ]; then + define_bool CONFIG_X86_LOCAL_APIC y + fi + fi fi dep_tristate ' Advanced Power Management BIOS support' CONFIG_APM $CONFIG_PM diff --unified --recursive --new-file linux-2.4.20/arch/i386/kernel/apic.c linux-2.4.20-rbed/arch/i386/kernel/apic.c --- linux-2.4.20/arch/i386/kernel/apic.c 2002-11-28 15:53:09.000000000 -0800 +++ linux-2.4.20-rbed/arch/i386/kernel/apic.c 2003-08-28 17:17:05.000000000 -0700 @@ -10,6 +10,7 @@ * for testing these extensively. * Maciej W. Rozycki : Various updates and fixes. * Mikael Pettersson : Power Management for UP-APIC. 
+ * Vincent Oberle : APIC Timer module support */ #include @@ -1047,6 +1048,25 @@ */ } + +#ifdef CONFIG_APIC_TIMER_UP + +/* + * Pointer to the function called from the local APIC timer interrupt + * handler (not in SMP); NULL means no handler is installed. + */ +void (*apic_timer_up_handler)(void); + +/* + * Installs f as the APIC local timer UP handler; NULL disables the call. + */ +void set_apic_timer_up_handler (void (*f)(void)) +{ + apic_timer_up_handler = f; +} +#endif /* CONFIG_APIC_TIMER_UP */ + + /* * Local APIC timer interrupt. This is the most natural way for doing * local interrupts, but local timer interrupts can be emulated by @@ -1071,6 +1091,12 @@ * because timer handling can be slow. */ ack_APIC_irq(); + +#ifdef CONFIG_APIC_TIMER_UP + if (apic_timer_up_handler != NULL) { + apic_timer_up_handler(); + } +#else /* * update_process_times() expects us to have done irq_enter(). * Besides, if we don't timer interrupts ignore the global @@ -1079,6 +1105,7 @@ irq_enter(cpu, 0); smp_local_timer_interrupt(regs); irq_exit(cpu, 0); +#endif if (softirq_pending(cpu)) do_softirq(); @@ -1172,5 +1199,8 @@ #endif setup_APIC_clocks(); + /*added by caixue lin on 04/14/03*/ + setup_uka_apic_timer(); + return 0; } diff --unified --recursive --new-file linux-2.4.20/arch/i386/kernel/entry.S linux-2.4.20-rbed/arch/i386/kernel/entry.S --- linux-2.4.20/arch/i386/kernel/entry.S 2002-11-28 15:53:09.000000000 -0800 +++ linux-2.4.20-rbed/arch/i386/kernel/entry.S 2004-02-04 18:43:15.000000000 -0800 @@ -264,7 +264,7 @@ ALIGN reschedule: - call SYMBOL_NAME(schedule) # test + call SYMBOL_NAME(user_schedule) # test jmp ret_from_sys_call ENTRY(divide_error) @@ -402,6 +402,31 @@ pushl $ SYMBOL_NAME(do_spurious_interrupt_bug) jmp error_code +#ifdef CONFIG_KGDB_THREAD +ENTRY(kern_schedule) + pushl %ebp + movl %esp, %ebp + pushl %ss + pushl %ebp + pushfl + pushl %cs + pushl 4(%ebp) + pushl %eax + pushl %es + pushl %ds + pushl %eax + pushl (%ebp) + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + call kern_do_schedule + movl %ebp, %esp + pop %ebp + 
ret +#endif + .data ENTRY(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* 0 - old "setup()" system call*/ @@ -657,6 +682,14 @@ .long SYMBOL_NAME(sys_ni_syscall) /* 250 sys_alloc_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_free_hugepages */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_exit_group */ + .long SYMBOL_NAME(sys_test_oneshot) /* sys_etst_oneshot */ + .long SYMBOL_NAME(sys_rbed_schedule) /* RBED scheduler sys call*/ + .long SYMBOL_NAME(sys_rbed_deadline_met) /*RBED deadline check*/ + .long SYMBOL_NAME(sys_rbed_rt_tracedump) /*RBED RT trace dump*/ + .long SYMBOL_NAME(sys_rbed_tracedump) /*RBED trace dump*/ + .long SYMBOL_NAME(sys_rate_adjust) /*get current actual period*/ + .long SYMBOL_NAME(sys_getperiod) /*RBED get actual period*/ + .long SYMBOL_NAME(sys_setqos) /*RBED set qos flag*/ .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) diff --unified --recursive --new-file linux-2.4.20/arch/i386/kernel/i386_ksyms.c linux-2.4.20-rbed/arch/i386/kernel/i386_ksyms.c --- linux-2.4.20/arch/i386/kernel/i386_ksyms.c 2002-08-02 17:39:42.000000000 -0700 +++ linux-2.4.20-rbed/arch/i386/kernel/i386_ksyms.c 2003-04-13 15:48:48.000000000 -0700 @@ -28,6 +28,7 @@ #include #include #include +#include extern void dump_thread(struct pt_regs *, struct user *); extern spinlock_t rtc_lock; @@ -75,6 +76,12 @@ EXPORT_SYMBOL(gdt); EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_APIC_TIMER_UP +#warning APIC_TIMER_UP is set +EXPORT_SYMBOL(set_apic_timer_up_handler); +#endif + #ifdef CONFIG_DEBUG_IOVIRT EXPORT_SYMBOL(__io_virt_debug); #endif diff --unified --recursive --new-file linux-2.4.20/arch/i386/kernel/Makefile linux-2.4.20-rbed/arch/i386/kernel/Makefile --- linux-2.4.20/arch/i386/kernel/Makefile 2002-11-28 15:53:09.000000000 -0800 +++ linux-2.4.20-rbed/arch/i386/kernel/Makefile 2003-04-14 11:50:43.000000000 -0700 @@ -37,7 +37,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o 
-obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o apic.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o apic.o nmi.o uka_apic_timer.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o acpitable.o obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o diff --unified --recursive --new-file linux-2.4.20/arch/i386/kernel/uka_apic_timer.c linux-2.4.20-rbed/arch/i386/kernel/uka_apic_timer.c --- linux-2.4.20/arch/i386/kernel/uka_apic_timer.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.20-rbed/arch/i386/kernel/uka_apic_timer.c 2004-02-10 21:22:17.000000000 -0800 @@ -0,0 +1,931 @@ +/* -*- linux-c -*- + * uka_apic_timer.c + * + * Module providing precise timers using the Local APIC timer. + * + * Copyright (C) 2000 Vincent Oberle (vincent@oberle.com) + * Institute of Telematics, University of Karlsruhe, Germany. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file COPYING in the main directory + * of this archive for more details. + */ + +/* + * IMPORTANT + * --------- + * + * The kernel must be patched, see patch directory + * Have a look in the config_apic_timer.h file. + * The kernel version is indicated in the KERNEL_VERSION file. + * + * + * Race conditions / disabling interrupts + * -------------------------------------- + * + * We disable interrupts whenever the timer list is being modified. + * That is stronger than the previous implementation, which only + * protected the timer list modifications from being interrupted + * by an APIC timer interrupt. + * Basically the goal is to make sure that the add_apic_timer(), + * mod_apic_timer() and del_apic_timer() are mutually exclusive. + * + * The timer functions themselves still run with interrupts enabled. + * + * Note that disabling interrupts can really impact system performance + * and a better solution may have to be found. 
+ * + * One possible race condition that can occur is (thanks to Ryan Barnett + * for pointing it out): + * "Suppose you have two threads of execution, one done from a user + * system call, and another done from an interrupt handler. They both want to + * do "add_apic_timer()". The user system call first starts... It calls + * APIC_TIMER_LOCK then it does insert_apic_timer(timer). Then it gets to + * the middle of insert_apic_timer() where it is setting the previous timer + * to NULL and messing with the timer list. At exactly that moment, an + * interrupt occurs and the user code is halted. Then the new thread of + * execution starts and does add_apic_timer(), it then gets messed up by the + * half-modified timer list. Eventually the old thread will begin again and + * now it will get really screwed up, possibly dereferencing an invalid + * pointer crashing the kernel. + * The same race condition could be applied if you have two user-threads that + * are making system calls that both want to add_del_timer()." + * + * + * Cache misses influence? + * ----------------------- + * + * What is the influence of cache miss? + * If we consider that a cache miss that forces a main RAM read + * needs a lot of cycles (-> 80), they can have a lot of influence + * if we try to have a "cycle" precision. + * But this is quite difficult to solve (maybe with some intelligent + * "dummy" reads...) + */ + + +#include +#include +#include +#include +#include +/* Access to the RTC (outb_p, inb_p), for autocalibrating */ +#include +/* Warning less compiling */ +#include + +#include + +/* Access to machine-specific registers (rdmsr...) */ +#include +/* All APIC stuff */ +#include +/* For LOCAL_TIMER_VECTOR */ +#include +/* 64-bit division. 
*/ +#include + +/* kmalloc, kfree */ +#include + +#include + +/*flag to notify the kernel the apic timer is ready to use*/ +int apic_timer_setup_flag = 0; + +void print_apic_timer (struct apic_timer_list *); + +/* File with defines for customizing of the module */ +//#include "config_apic_timer.h" + + +//MODULE_AUTHOR("Vincent Oberle"); +//MODULE_DESCRIPTION("Precise timers module using the local APIC timer"); + + + +/*** init code ***/ + +/* + * Writes LOCAL_TIMER_VECTOR to the LVTT (unmasking the timer interrupt) and + * divide-by-1 to the TDCR, then installs supp_handler as the UP timer handler. + * Always returns 0. + * Not to be used outside the module. See add_apic_timer instead. + */ +int request_apic_timer_irq (void (*supp_handler)(void)) +{ + printk(KERN_INFO "Add the supp handler to the APIC timer handler\n"); + + /* Here, we enable (unmask) the local APIC timer interrupt */ + apic_write(APIC_LVTT, LOCAL_TIMER_VECTOR); + + /* Divide configuration register: divide by 1 */ + apic_write(APIC_TDCR, APIC_TDR_DIV_1); + + set_apic_timer_up_handler(supp_handler); + + return 0; +} + + +/* + * Processor and bus frequencies. + * Needed because the APIC timer is based on the bus freq + * and we want to set it with a TSC value which is based on + * the processor freq. 
+ */ +unsigned long proc_freq; +unsigned long bus_freq; + +/* + * Following functions are copied from apic.c + * Some cannot be exported since they're __init + */ +static unsigned int get_8254_timer_count_uka(void) +{ + extern spinlock_t i8253_lock; + unsigned long flags; + + unsigned int count; + + spin_lock_irqsave(&i8253_lock, flags); + + outb_p(0x00, 0x43); /* write 0x00 to port 0x43, then read the count from port 0x40, low byte first */ + count = inb_p(0x40); + count |= inb_p(0x40) << 8; + + spin_unlock_irqrestore(&i8253_lock, flags); + + return count; +} + +void wait_8254_wraparound_uka(void) +{ + unsigned int curr_count, prev_count=~0; + int delta; + + curr_count = get_8254_timer_count_uka(); + + do { + prev_count = curr_count; + curr_count = get_8254_timer_count_uka(); + delta = curr_count-prev_count; + } while (delta < 300); /* stop once the count jumps up by at least 300 between two reads */ +} + +#define APIC_DIVISOR 16 + +void setup_APIC_LVTT_uka(unsigned int clocks) +{ + unsigned int lvtt1_value, tmp_value; + + lvtt1_value = SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV) | + APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + apic_write_around(APIC_LVTT, lvtt1_value); + + /* Divide PICLK by 16 */ + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + apic_write_around(APIC_TMICT, clocks / APIC_DIVISOR); /* initial count, scaled down by the divisor */ +} + +/* + * Calibration code. + * This function initializes proc_freq (TSC cycles) and bus_freq (APIC timer + * counts times APIC_DIVISOR), both measured over the same LOOPS 8254 wraparounds. + * Similar to the code in arch/i386/kernel/apic.c + * int __init calibrate_APIC_clock(void) + */ +void calibrate_apic_timer (void) +{ + /* TSC readings, for the processor clock */ + unsigned long long t1 = 0, t2 = 0; + /* APIC_TMCCT readings, for the bus clock */ + unsigned long tt1, tt2; + + int i; + const int LOOPS = HZ/10; + + printk(KERN_INFO "Calibrating APIC timer\n"); + + if (!cpu_has_tsc) { + printk(KERN_WARNING "CPU has no TSC, " + "cannot calibrate the APIC timer\n"); + return; /* proc_freq and bus_freq are left unchanged */ + } + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. 
+ */ + setup_APIC_LVTT_uka(1000000000); + + /* Wraparound to start exact measurement */ + wait_8254_wraparound_uka(); + + rdtscll(t1); + tt1 = apic_read(APIC_TMCCT); + + /* + * Let's wait LOOPS wraparounds: + */ + for (i = 0; i < LOOPS; i++) + wait_8254_wraparound_uka(); + + tt2 = apic_read(APIC_TMCCT); + rdtscll(t2); + + proc_freq = (long)(t2 - t1); + /* bus_freq = tt1 - tt2; */ + bus_freq = (tt1 - tt2) * APIC_DIVISOR; /* tt1 was read first: tt1 - tt2 is taken as the elapsed count */ + + printk(KERN_INFO "Proc freq %ld\n", proc_freq / LOOPS); + printk(KERN_INFO "Bus freq %ld\n", bus_freq / LOOPS); +} + +/*** end of init code ***/ + + + +/* + * Error adjustment: + * tsc_error saves the value of the error for the previous timer + * issued. This value is used to correct the programming of the next timer. + */ +unsigned long tsc_error; + + + +#ifdef APIC_TIMER_TRACE_EXE + +/* + * Array to store the names of the functions called. + * If there are more traces than the size of the array, the extra ones are + * NOT recorded. + */ +#define APIC_TIMER_TRACE_ARRAY_SIZE 1000 +static char *trace_array[APIC_TIMER_TRACE_ARRAY_SIZE]; +static int trace_index; + +#define BUILD_TRACE_NAME(n) static char *trace_name_##n = #n; +BUILD_TRACE_NAME(start) BUILD_TRACE_NAME(detach) +BUILD_TRACE_NAME(detach_first) BUILD_TRACE_NAME(insert) +BUILD_TRACE_NAME(run) BUILD_TRACE_NAME(add) +BUILD_TRACE_NAME(del) BUILD_TRACE_NAME(mod) +BUILD_TRACE_NAME(exe) BUILD_TRACE_NAME(do_irq) +#undef BUILD_TRACE_NAME +#define APIC_TIMER_TRACE(n) if (trace_index < APIC_TIMER_TRACE_ARRAY_SIZE) \ + trace_array[trace_index++] = trace_name_##n; + +#else +#define APIC_TIMER_TRACE(n) +#endif /* APIC_TIMER_TRACE_EXE */ + + + +#ifdef APIC_TIMER_ERROR_STAT + +/* + * Error statistics. + */ +unsigned long min_error, max_error; +unsigned long long total_error; +unsigned long nb_error_measure; + +#endif /* APIC_TIMER_ERROR_STAT */ + + + +/* + * Starts the APIC timer. + * The value in parameter is in "processor clocks" units. + * It is the value of the TSC when we want to execute the timer function. 
+ * + * The function returns an integer: + * "0" means the timer should be issued immediately because it has expired, + * "1" means the APIC timer was set. + */ +inline int start_apic_timer (unsigned long long value) +{ + unsigned long eax, edx; + long long exp; /* Signed: TSC cycles left until value, may be negative */ + unsigned long long apic_tmict; /* For the 64-bit division */ + + APIC_TIMER_TRACE(start); + + rdtsc(eax, edx); + exp = value - TO_ULL(edx, eax); + /* printk("Val to prog in APIC before correction \t%d (TSC)\n", exp); */ + + /* Error adjustment: program the timer earlier by the last measured error */ + exp -= tsc_error; + + /* + * It is possible to try to program a timer in the past (exp < 0) + * if the timer expired since we called start_apic_timer. + */ + if (exp <= 0) + return 0; /* Timer expired */ + + /* + * If the timer is too far in the future, ie the value cannot + * be programmed in the 32-bit APIC timer, we program it earlier (clamped + * to 0xFFFFFFFF), knowing the timer will be automatically reprogrammed then. + */ + if (exp > 0xFFFFFFFF) { + /* printk(KERN_INFO "apic: too large exp value, %ld%ld\n", + ULL_HIGH(exp), ULL_LOW(exp)); */ + exp = 0xFFFFFFFF; + } + + /* + * Conversion in "bus units" for the APIC timer. + * NOTE(review): this divides by proc_freq; confirm it cannot still be 0 here. + * Accurate 64-bit division. Thanks to Ryan Barnett. + * do_div puts the result of the division in the first param and + * returns the remainder; a non-zero remainder rounds the count up. + */ + apic_tmict = (unsigned long long) exp * (unsigned long long)bus_freq; + if (do_div(apic_tmict, proc_freq)) + apic_tmict++; + + /* + * Starts the APIC timer by writing its initial count. + */ + apic_write(APIC_TMICT, (unsigned long)apic_tmict); + + return 1; /* APIC Timer programmed */ +} + + + +/*** timer management functions ***/ + +/* + * The APIC timer list. + * We keep the list in order (ie with the "expires" field) + * to have the fastest execution possible (ie we only have + * to look at the beginning of the list to know which timers + * to execute). + * For the store, we look first at the end, because timers + * are more likely to be added in order (ie at the end of the list). 
+ */ +struct apic_timer_list *apic_timer_start; +struct apic_timer_list *apic_timer_end; + + +/* + * Removes the timer from the list; returns 0 if timer is NULL, 1 otherwise. + * + * Only list management, does nothing with time or APIC stuff. + * Does not modify the timer pointed to by "timer" + * (no timer->next = timer->prev = NULL;) + * This function should be protected by a lock. + */ +int detach_apic_timer (struct apic_timer_list *timer) +{ + APIC_TIMER_TRACE(detach); + + if (!timer) { + printk(KERN_WARNING "timer null in detach_apic_timer\n"); + return 0; + } + + if (timer->prev) { + timer->prev->next = timer->next; + } else { + /* No predecessor: only move the head if the timer really is the head */ + if (apic_timer_start == timer) + apic_timer_start = timer->next; + } + + if (timer->next) { + timer->next->prev = timer->prev; + } else { /* no successor: only move the tail if the timer really is the tail */ + if (apic_timer_end == timer) + apic_timer_end = timer->prev; + } + + return 1; +} + +/* + * Removes the first timer of the list; returns 0 if the list is empty, 1 otherwise. + * Faster than detach_apic_timer (even inlined). + * + * Only list management, does nothing with time or APIC stuff. + * Does not modify the timer that "apic_timer_start" pointed to + * This function should be protected by a lock. + */ +inline int detach_first_apic_timer (void) +{ + APIC_TIMER_TRACE(detach_first); + + if (!apic_timer_start) { + printk(KERN_WARNING "apic_timer_start null in detach_first_apic_timer\n"); + return 0; + } + + apic_timer_start = apic_timer_start->next; + if (apic_timer_start) + apic_timer_start->prev = NULL; + else + apic_timer_end = NULL; /* the list is now empty */ + + return 1; +} + +/* + * Adds a timer to the list, starting from the end + * and ordering it with the expires field. + * + * Does not access the APIC. + * This function should be protected by a lock. 
+ */ +void insert_apic_timer (struct apic_timer_list *timer) +{ + struct apic_timer_list *t_current = apic_timer_end; + unsigned long long t_expires; + + APIC_TIMER_TRACE(insert); + + if (!timer) { + printk(KERN_WARNING "insert_apic_timer timer null\n"); + return; + } + t_expires = timer->expires; /* read only after the NULL check above */ + /* + * Walk backwards from the end of the list until we find a timer whose + * expires is lower than or equal to the one being inserted. + */ + while ((t_current) && (t_current->expires > t_expires)) + t_current = t_current->prev; + + /* Add the timer */ + if (t_current) { + /* The timer is added after t_current */ + timer->next = t_current->next; + timer->prev = t_current; + t_current->next = timer; + if (timer->next) /* ie old t_current->next */ + timer->next->prev = timer; + else + apic_timer_end = timer; + } else { + /* The timer must be added at the beginning */ + timer->next = apic_timer_start; + timer->prev = NULL; + if (apic_timer_start) /* List not empty */ + apic_timer_start->prev = timer; + else /* List empty */ + apic_timer_end = timer; + apic_timer_start = timer; + } +} + + +#define APIC_TIMER_ERR_ADJ_NO_UPDATE 0 +#define APIC_TIMER_ERR_ADJ_UPDATE 1 + +/* + * Indicates if we are in the run_apic_timer function. + * It is used to avoid re-entering that function when it would + * otherwise be called again while it is still running. + */ +static int in_run_apic_timer; + +/* + * Check if there are some timers to be issued. + * + * The flag parameter indicates if the error adjustment + * should be updated (APIC_TIMER_ERR_ADJ_UPDATE) + * or not (APIC_TIMER_ERR_ADJ_NO_UPDATE) + * + * The locking is managed by the function. + * + * For each expired timer, detaches it and calls the timer function. 
+ */ +void run_apic_timer (int flag) +{ + unsigned long eax, edx; + unsigned long long current_tsc; + struct apic_timer_list *t_current; + unsigned long tmp_tsc_error; + unsigned long flags; + + in_run_apic_timer = 1; /* callers that test this flag skip a nested run */ + + restart_run_apic_timer: + + APIC_TIMER_TRACE(run); + + /*This statement added by caixue lin on 02/10/04*/ + //flag = APIC_TIMER_ERR_ADJ_NO_UPDATE; + + if (flag == APIC_TIMER_ERR_ADJ_UPDATE) + tmp_tsc_error = tsc_error; + else + tmp_tsc_error = 0; + + rdtsc(eax, edx); + current_tsc = TO_ULL(edx, eax); + + /* + * Detach and call the timer functions of all timers whose expires, + * minus the error correction, is already before current_tsc. + */ + + while ((apic_timer_start) && ((apic_timer_start->expires - tmp_tsc_error) < current_tsc)) { + if (flag == APIC_TIMER_ERR_ADJ_UPDATE) { + rdtsc(eax, edx); + current_tsc = TO_ULL(edx, eax); + + tmp_tsc_error = DIFF_ABS(current_tsc, apic_timer_start->expires); + tsc_error = DIFF_ABS(tsc_error, tmp_tsc_error); /* DIFF_ABS is defined elsewhere; presumably the absolute difference */ + tmp_tsc_error = tsc_error; +#ifdef APIC_TIMER_ERROR_STAT + if (tsc_error < min_error) min_error = tsc_error; + if (max_error < tsc_error) max_error = tsc_error; + total_error += tsc_error; + nb_error_measure++; +#endif /* APIC_TIMER_ERROR_STAT */ + } + + t_current = apic_timer_start; /* pointer to the timer we're working on */ + + /* Detach timer - Protected by disabling interrupts. */ + local_irq_save(flags); + detach_first_apic_timer(); + /* here the list is modified, apic_timer_start points to the next timer */ + local_irq_restore(flags); + + t_current->next = t_current->prev = NULL; + + APIC_TIMER_TRACE(exe); + + /* + * Call the timer function. + * It is allowed to modify the timer list here, + * so we first enable interrupts. 
+ */ + if (t_current->function != NULL) { + /* local_irq_enable(); */ + //printk(KERN_DEBUG "***before***Process: %d with timer: %llu\n, now: %llu\n",((struct task_struct *)(t_current->data))->pid, t_current->expires, current_tsc); + t_current->function(t_current->expires, t_current->data); + //printk(KERN_DEBUG "***after***Process: %d with timer: %llu\n, now: %llu\n",((struct task_struct *)(t_current->data))->pid, t_current->expires, current_tsc); + /* local_irq_disable(); */ + } + +#ifdef APIC_TIMER_FREE_ISSUED + printk(KERN_DEBUG "free timer????\n"); + kfree(t_current); +#endif + + rdtsc(eax, edx); + current_tsc = TO_ULL(edx, eax); + + /* apic_timer_start already points to the next timer */ + } + + /* Reprogram the APIC timer; if the head has already expired, run the list again. */ + if ( (apic_timer_start) && + (!(start_apic_timer(apic_timer_start->expires))) ) { + flag = APIC_TIMER_ERR_ADJ_UPDATE; + goto restart_run_apic_timer; + } + + in_run_apic_timer = 0; +} + + +/* + * The function that is called by the APIC timer + * (see request_apic_timer_irq) + */ +void do_apic_irq (void) +{ + /* local_irq_disable(); */ + + APIC_TIMER_TRACE(do_irq); + + /* + * If the timer list is empty, do not do anything. + */ + if (!apic_timer_start) { + local_irq_enable(); /* NOTE(review): only this path enables interrupts; the matching disable above is commented out - confirm intent */ + return; + } + + if (!in_run_apic_timer) { + run_apic_timer(APIC_TIMER_ERR_ADJ_UPDATE); + } + + /* local_irq_enable(); */ +} + + + +/* + * Adds an APIC timer; returns 1 on success, 0 if timer is NULL or already queued + */ +int add_apic_timer (struct apic_timer_list *timer) +{ + unsigned long flags; + + //printk("Adding apic timer\n"); + APIC_TIMER_TRACE(add); + + if (!timer) { + return 0; /* A good usage of the module shouldn't need that... */ + } + + /* Already linked: it has a predecessor, or it is the list head (whose prev is NULL) */ + if (timer->prev || apic_timer_start == timer) { + //printk(KERN_WARNING "bug: kernel timer added twice at %p.\n", + printk("bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); + return 0; + } + + /* + * Insert the timer, protected by disabling interrupts. 
+ */ + local_irq_save(flags); + insert_apic_timer(timer); + local_irq_restore(flags); + + /* + * Check if there are some timers to execute, unless we were + * already called from run_apic_timer (avoiding recursion). + */ + if (!in_run_apic_timer) { + run_apic_timer(APIC_TIMER_ERR_ADJ_NO_UPDATE); + } + + return 1; +} + + +/* + * Removes an APIC timer; returns 0 if timer is NULL, else detach_apic_timer's result. + * If the next and prev pointers of the timer are not NULL, + * the timer is considered to be in the list (if it isn't, the + * list can be corrupted). + */ +int del_apic_timer (struct apic_timer_list *timer) +{ + int ret; + unsigned long flags; + + APIC_TIMER_TRACE(del); + + /* + * We do not check if we delete the first timer or not: + * if it is the first one, an interrupt will be generated, + * the list will be checked and probably no timer issued. + * The APIC timer will be reprogrammed to the first next timer. + */ + + if (!timer) { + return 0; + } + + /* Detach the timer. */ + local_irq_save(flags); + ret = detach_apic_timer(timer); + + /* + * It is a good idea (TM) to try to have next = prev = NULL + * for a timer that is not in the list. + */ + timer->next = timer->prev = NULL; + + local_irq_restore(flags); + + if (!in_run_apic_timer) { + run_apic_timer(APIC_TIMER_ERR_ADJ_NO_UPDATE); + } + + return ret; +} + + +/* + * More efficient way to update the expires field of an active timer + * (if the timer is inactive it will be activated). + * mod_apic_timer(a,b) is equivalent to: + * del_apic_timer(a); a->expires = b; add_apic_timer(a) + */ +int mod_apic_timer (struct apic_timer_list *timer, unsigned long long expires) +{ + int ret; + unsigned long flags; + + APIC_TIMER_TRACE(mod); + + if (!timer) { + return 0; + } + + /* + if (timer->prev) { + printk(KERN_WARNING "bug: kernel timer added twice at %p.\n", + __builtin_return_address(0)); + return -1; + } + */ + + /* Modify the timer. 
*/ + local_irq_save(flags); + + timer->expires = expires; + ret = detach_apic_timer(timer); + insert_apic_timer(timer); + + local_irq_restore(flags); + + if (!in_run_apic_timer) { + run_apic_timer(APIC_TIMER_ERR_ADJ_NO_UPDATE); + } + + return ret; +} + + +/* + * Export the timer list management functions. + */ + +/* +EXPORT_SYMBOL(add_apic_timer); +EXPORT_SYMBOL(del_apic_timer); +EXPORT_SYMBOL(mod_apic_timer); + +//EXPORT_SYMBOL(apic_timer_start); // Useful for debugging +*/ + +/*** end of timer management functions ***/ + + +/*** procfs code ***/ + +#ifdef CONFIG_PROC_FS + +/* + * procfs get_info handler: lists the pending timers with interrupts disabled, + * so calling it while timers are due may hurt their performance and precision. + */ +static int apic_timer_proc_get_info (char *buffer, + char **start, + off_t offset, + int length) +{ + int len; + off_t pos = 0; + off_t begin = 0; + struct apic_timer_list *t_current; + unsigned long flags; +#ifdef APIC_TIMER_TRACE_EXE + int i = 0; +#endif +#ifdef APIC_TIMER_ERROR_STAT + unsigned long long av_error; +#endif + + len = sprintf(buffer, "APIC timer module for " UTS_RELEASE "\n"); + + local_irq_save(flags); + t_current = apic_timer_start; /* snapshot the head only once interrupts are off */ + if (!apic_timer_start) { + len += sprintf(buffer + len, "No timer left to be issued\n"); + goto lab_proc_get_info; + } + + len += sprintf(buffer + len, "Timer to be issued\n"); + while ((t_current) && (pos <= offset + length)) { + len += sprintf(buffer + len, "Timer with %lu %lu\n", + ULL_HIGH(t_current->expires), + ULL_LOW(t_current->expires)); + pos = begin + len; + if (pos < offset) { + len = 0; + begin = pos; + } + t_current = t_current->next; + } + + lab_proc_get_info: + +#ifdef APIC_TIMER_TRACE_EXE + len += sprintf(buffer + len, "Trace information\n"); + while ((i < trace_index) && (pos <= offset + length)) { + len += sprintf(buffer + len, "%s\n", trace_array[i]); + pos = begin + len; + if (pos < offset) { + len = 0; + begin = pos; + } + i++; + } +#endif + + local_irq_restore(flags); + +#ifdef APIC_TIMER_ERROR_STAT + 
if (nb_error_measure > 0) { + av_error = total_error; + do_div(av_error, nb_error_measure); + len += sprintf(buffer + len, "Min error: %lu\n", min_error); + len += sprintf(buffer + len, "Max error: %lu\n", max_error); + len += sprintf(buffer + len, "Average error: %llu\n", av_error); /* av_error is unsigned long long */ + } +#endif + + *start = buffer + (offset - begin); + len -= (offset - begin); + if (len > length) + len = length; + return len; +} + +#endif + +/*** end of procfs code ***/ + +void print_apic_timer (struct apic_timer_list *t_current) +{ + if (t_current) { + printk(KERN_DEBUG "Process: %d with timer: %llu\n",((struct task_struct *)(t_current->data))->pid, t_current->expires); + } +} + +void print_apic_timer_list (void) +{ + struct apic_timer_list *t_current = apic_timer_start; + printk("APIC timer list\n"); + while (t_current) { + print_apic_timer(t_current); + t_current = t_current->next; + } +} + +/*added by caixue lin on 04/14/03*/ +int setup_uka_apic_timer() +{ + printk("\n"); + printk(KERN_INFO "Init the UKA APIC timer \n"); + + /* + * Checks if there is a good APIC. + * If there is no local APIC found, the processor might not + * be a P6, or the option wasn't correctly activated in the kernel. + * This check should be enough. + */ + if (!cpu_has_apic) + return -ENODEV; + + set_apic_timer_up_handler(NULL); + +#ifdef APIC_TIMER_TRACE_EXE + printk(KERN_INFO "APIC tracing activated\n"); +#endif + + /* + * We do the calibration two times since sometimes the first one gets + * a wrong bus clock value. 
+ */ + calibrate_apic_timer(); + calibrate_apic_timer(); + + if (request_apic_timer_irq((void*)do_apic_irq) < 0) + return -ENODEV; + + tsc_error = 0; + +#ifdef APIC_TIMER_TRACE_EXE + trace_index = 0; +#endif +#ifdef APIC_TIMER_ERROR_STAT + min_error = 0xFFFFFFFF; + max_error = 0; + total_error = 0; + nb_error_measure = 0; +#endif + + apic_timer_start = apic_timer_end = NULL; + +#ifdef CONFIG_PROC_FS + if (!create_proc_info_entry("apic_timer", 0, NULL, apic_timer_proc_get_info)) + printk(KERN_ERR "uka_apic_timer: registering /proc/apic_timer failed\n"); +#endif + + /*setup the flag to notify the kernel the apic timer is ready to use*/ + apic_timer_setup_flag = 1; + + return 0; + +} diff --unified --recursive --new-file linux-2.4.20/Documentation/Configure.help linux-2.4.20-rbed/Documentation/Configure.help --- linux-2.4.20/Documentation/Configure.help 2002-11-28 15:53:08.000000000 -0800 +++ linux-2.4.20-rbed/Documentation/Configure.help 2003-04-13 15:49:02.000000000 -0700 @@ -256,28 +256,41 @@ IO-APIC support on uniprocessors CONFIG_X86_UP_IOAPIC - An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an - SMP-capable replacement for PC-style interrupt controllers. Most - SMP systems and a small number of uniprocessor systems have one. - If you have a single-CPU system with an IO-APIC, you can say Y here - to use it. If you say Y here even though your machine doesn't have - an IO-APIC, then the kernel will still run with no slowdown at all. + APIC (Advanced Programmable Interrupt Controller) is a scheme for + delivering hardware interrupt requests to the CPU. It is commonly + used on systems with several CPUs. If you have a single-CPU system + which has a processor that has an integrated APIC, you can say Y + here to enable and use it. If you say Y here even though your + machine doesn't have an APIC, then the kernel will still run with no + slowdown at all. 
The advantage of APIC support is the possibility + to use performance counters, and the APIC based NMI watchdog which + detects hard lockups. + + An IO-APIC is an SMP-capable replacement for PC-style interrupts + controllers, most SMP systems and a small number of uniprocessor + systems have these chips. Linux will try to detect and use this + chip, if it's not found then Linux falls back to PC-style interrupt + handling. - If you have a system with several CPUs, you do not need to say Y - here: the IO-APIC will be used automatically. - -Local APIC Support on Uniprocessors +APIC support on uniprocessors CONFIG_X86_UP_APIC - A local APIC (Advanced Programmable Interrupt Controller) is an - integrated interrupt controller in the CPU. If you have a single-CPU - system which has a processor with a local APIC, you can say Y here to - enable and use it. If you say Y here even though your machine doesn't - have a local APIC, then the kernel will still run with no slowdown at - all. The local APIC supports CPU-generated self-interrupts (timer, - performance counters), and the NMI watchdog which detects hard lockups. + APIC (Advanced Programmable Interrupt Controller) is a scheme for + delivering hardware interrupt requests to the CPU. It is commonly + used on systems with several CPUs. If you have a single-CPU system + which has a processor that has an integrated APIC, you can say Y + here to enable and use it. If you say Y here even though your + machine doesn't have an APIC, then the kernel will still run with no + slowdown at all. The advantage of APIC support is the possibility + to use performance counters, and the APIC based NMI watchdog which + detects hard lockups. + +Local APIC timer support +CONFIG_APIC_TIMER_UP + This option allows uniprocessor-kernels to use the timer of the + Local APIC. + + See also http://vincent.oberle.com/apic_timer.html - If you have a system with several CPUs, you do not need to say Y - here: the local APIC will be used automatically. 
Kernel math emulation CONFIG_MATH_EMULATION diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/apic.h linux-2.4.20-rbed/include/asm-i386/apic.h --- linux-2.4.20/include/asm-i386/apic.h 2002-08-02 17:39:45.000000000 -0700 +++ linux-2.4.20-rbed/include/asm-i386/apic.h 2003-09-26 18:22:15.000000000 -0700 @@ -94,6 +94,10 @@ #define NMI_LOCAL_APIC 2 #define NMI_INVALID 3 +#ifdef CONFIG_APIC_TIMER_UP +extern void set_apic_timer_up_handler (void (*f)(void)); +#endif + #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* __ASM_APIC_H */ diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/errno.h linux-2.4.20-rbed/include/asm-i386/errno.h --- linux-2.4.20/include/asm-i386/errno.h 2002-08-02 17:39:45.000000000 -0700 +++ linux-2.4.20-rbed/include/asm-i386/errno.h 2003-04-13 15:48:03.000000000 -0700 @@ -128,5 +128,6 @@ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ +#define ENRT 125 /* cannot run process as real-time */ #endif diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/param.h linux-2.4.20-rbed/include/asm-i386/param.h --- linux-2.4.20/include/asm-i386/param.h 2000-10-27 11:04:43.000000000 -0700 +++ linux-2.4.20-rbed/include/asm-i386/param.h 2003-05-22 17:53:28.000000000 -0700 @@ -3,6 +3,7 @@ #ifndef HZ #define HZ 100 +//#define HZ 1000 #endif #define EXEC_PAGESIZE 4096 diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/processor.h linux-2.4.20-rbed/include/asm-i386/processor.h --- linux-2.4.20/include/asm-i386/processor.h 2002-08-02 17:39:45.000000000 -0700 +++ linux-2.4.20-rbed/include/asm-i386/processor.h 2003-09-26 18:22:15.000000000 -0700 @@ -385,6 +385,9 @@ /* IO permissions */ int ioperm; unsigned long io_bitmap[IO_BITMAP_SIZE+1]; +#ifdef CONFIG_X86_REMOTE_DEBUG + struct pt_regs *kgdbregs; +#endif }; #define INIT_THREAD { \ @@ -450,13 +453,14 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned 
long)(tsk)))[1019]) #define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022]) - #define THREAD_SIZE (2*PAGE_SIZE) #define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) #define free_task_struct(p) free_pages((unsigned long) (p), 1) #define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) #define init_task (init_task_union.task) +//tim b ? don't know if this is right +#define init_rt_task (init_task_union.task) #define init_stack (init_task_union.stack) struct microcode { diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/ptrace.h linux-2.4.20-rbed/include/asm-i386/ptrace.h --- linux-2.4.20/include/asm-i386/ptrace.h 2001-09-14 14:04:08.000000000 -0700 +++ linux-2.4.20-rbed/include/asm-i386/ptrace.h 2003-04-13 15:48:03.000000000 -0700 @@ -58,6 +58,7 @@ #define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) #define instruction_pointer(regs) ((regs)->eip) extern void show_regs(struct pt_regs *); +extern void show_registers(struct pt_regs *regs); #endif #endif diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/timex.h linux-2.4.20-rbed/include/asm-i386/timex.h --- linux-2.4.20/include/asm-i386/timex.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/include/asm-i386/timex.h 2003-09-26 18:22:15.000000000 -0700 @@ -55,4 +55,6 @@ #define vxtime_lock() do {} while (0) #define vxtime_unlock() do {} while (0) +extern unsigned long cpu_khz; + #endif diff --unified --recursive --new-file linux-2.4.20/include/asm-i386/unistd.h linux-2.4.20-rbed/include/asm-i386/unistd.h --- linux-2.4.20/include/asm-i386/unistd.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/include/asm-i386/unistd.h 2004-02-04 18:42:31.000000000 -0800 @@ -257,6 +257,14 @@ #define __NR_alloc_hugepages 250 #define __NR_free_hugepages 251 #define __NR_exit_group 252 +#define __NR_test_oneshot 253 +#define __NR_rbed_schedule 254 +#define __NR_rbed_deadline_met 255 +#define 
__NR_rbed_rt_tracedump 256 +#define __NR_rbed_tracedump 257 +#define __NR_rate_adjust 258 +#define __NR_getperiod 259 +#define __NR_setqos 260 /* user-visible error numbers are in the range -1 - -124: see */ diff --unified --recursive --new-file linux-2.4.20/include/linux/list.h linux-2.4.20-rbed/include/linux/list.h --- linux-2.4.20/include/linux/list.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/include/linux/list.h 2003-09-26 18:22:15.000000000 -0700 @@ -88,7 +88,7 @@ * @entry: the element to delete from the list. * Note: list_empty on entry does not return true after this, the entry is in an undefined state. */ -static inline void list_del(struct list_head *entry) +static void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->next = (void *) 0; diff --unified --recursive --new-file linux-2.4.20/include/linux/sched.h linux-2.4.20-rbed/include/linux/sched.h --- linux-2.4.20/include/linux/sched.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/include/linux/sched.h 2004-03-28 22:24:12.000000000 -0800 @@ -76,12 +76,17 @@ extern int nr_running, nr_threads; extern int last_pid; +extern int nr_running_rbed[]; +extern int nr_task_rbed[]; +extern int wrap_be_flag; + #include #include #include #include #ifdef __KERNEL__ #include +#include #endif #include @@ -119,15 +124,38 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 +/*2003-2-18 following definitions added by caixue Lin*/ +#define SCHED_RBED 4 +#define BE 0 +#define SRT 1 +#define HRT 2 +#define MDSRT 0 +#define RASRT 1 +#define RBSRT 2 + /* * This is an additional bit set when we want to * yield the CPU for one re-schedule.. 
*/ #define SCHED_YIELD 0x10 +/*2003-2-18 modified by caixue Lin*/ +/***************modify starts here***************/ struct sched_param { int sched_priority; + int srt_type; //srt process type: MDSRT,RASRT,RBSRT + int process_type; //process type: HRT,SRT,BE + int nr_qos_level; //the number of qos levels + int qos_scaler; //qos scale factor (see task_struct.qos_scaler) + struct qos_struct * qos; //qos specifications, one entry per qos level +}; +struct qos_struct { + long benefit; //qos benefit, <=1000(normalized) + long wcet; //worst case execution time, in us + unsigned long period; //period in us, <=5s right now, otherwise overflow. + unsigned long bufferSize; //buffer size, for example sound card read data from memory buffer }; +/***************modify ends here***************/ struct completion; @@ -157,11 +185,14 @@ #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +asmlinkage void kern_schedule(void); +asmlinkage void kern_do_schedule(struct pt_regs); extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); extern int start_context_thread(void); extern int current_is_keventd(void); +extern void exit_edf( pid_t ); // KSB /* * The default fd array needs to be at least BITS_PER_LONG, @@ -325,6 +356,9 @@ * that's just fine.)
*/ struct list_head run_list; + struct list_head rt_run_list; + struct list_head be_run_list; + struct list_head srt_task_list; unsigned long sleep_time; struct task_struct *next_task, *prev_task; @@ -418,6 +452,29 @@ /* journalling filesystem info */ void *journal_info; + +/* rbed main process property fields*/ + unsigned long srt_type; //SRT: MD, RA, RB + unsigned long process_type; //HRT, SRT, BE + unsigned long period; + unsigned long actual_period; //actual period + unsigned long previous_period; //previous period + unsigned long wcet; + unsigned long long release_time; + unsigned long long completion_time; + unsigned long long initial_release_time; + int rbed_state; + int weight; +/* rbed apic timer fields*/ + unsigned long long apic_start_tsc; + long long apic_left_tsc; + struct apic_timer_list * apic_timer; +/* rbed process qos fields*/ + int nr_qos_level; //number of qos levels, <=10 + int ct_qos_level; //current qos level, [0..9] + struct qos_struct * qos;//qos specifications + long qos_scaler; //qos scale factor for controlling external errors + long prev_qos_scaler; //save previous qos scale factor }; /* @@ -485,6 +542,9 @@ cpus_runnable: -1, \ cpus_allowed: -1, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ + rt_run_list: LIST_HEAD_INIT(tsk.rt_run_list), \ + be_run_list: LIST_HEAD_INIT(tsk.be_run_list), \ + srt_task_list: LIST_HEAD_INIT(tsk.srt_task_list), \ next_task: &tsk, \ prev_task: &tsk, \ p_opptr: &tsk, \ @@ -510,6 +570,25 @@ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ + srt_type: 0, \ + process_type: 0, \ + period: 0, \ + actual_period: 0, \ + previous_period: 0, \ + wcet: 0, \ + release_time: 0, \ + completion_time: 0, \ + initial_release_time: 0, \ + rbed_state: 0, \ + weight: 0, \ + apic_start_tsc: 0, \ + apic_left_tsc: 0, \ + apic_timer: NULL, \ + nr_qos_level: 0, \ + ct_qos_level: 0, \ + qos: NULL, \ + qos_scaler: 1, \ + prev_qos_scaler: 1, \ } @@ -517,16 +596,27 @@ # define INIT_TASK_SIZE 2048*sizeof(long) 
#endif +#ifndef INIT_RT_TASK_SIZE +#define INIT_RT_TASK_SIZE 2048*sizeof(long) +#endif +#ifndef INIT_BE_TASK_SIZE +#define INIT_BE_TASK_SIZE 2048*sizeof(long) +#endif + + union task_union { struct task_struct task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; - extern union task_union init_task_union; +extern union task_union init_rt_task_union; +extern union task_union init_be_task_union; extern struct mm_struct init_mm; extern struct task_struct *init_tasks[NR_CPUS]; +extern struct list_head srt_taskqueue_head; + /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) extern struct task_struct *pidhash[PIDHASH_SZ]; @@ -856,6 +946,10 @@ }) #define REMOVE_LINKS(p) do { \ + if((p)->process_type == SRT){ /* SRT tasks also live on srt_taskqueue_head */ \ + list_del(&(p)->srt_task_list); \ + (p)->srt_task_list.next = NULL; \ + } \ (p)->next_task->prev_task = (p)->prev_task; \ (p)->prev_task->next_task = (p)->next_task; \ if ((p)->p_osptr) \ @@ -869,16 +963,19 @@ #define SET_LINKS(p) do { \ (p)->next_task = &init_task; \ (p)->prev_task = init_task.prev_task; \ - init_task.prev_task->next_task = (p); \ - init_task.prev_task = (p); \ - (p)->p_ysptr = NULL; \ - if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ - (p)->p_osptr->p_ysptr = p; \ - (p)->p_pptr->p_cptr = p; \ - } while (0) + init_task.prev_task->next_task = (p); \ + init_task.prev_task = (p); \ + (p)->p_ysptr = NULL; \ + if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ + (p)->p_osptr->p_ysptr = p; \ + (p)->p_pptr->p_cptr = p; \ + if((p)->process_type == SRT) \ + list_add_tail(&(p)->srt_task_list, &srt_taskqueue_head);\ +} while (0) #define for_each_task(p) \ - for (p = &init_task ; (p = p->next_task) != &init_task ; ) + for (p = &init_task ; (p = p->next_task) != &init_task ; ) + #define for_each_thread(task) \ for (task = next_thread(current) ; task != current ; task = next_thread(task)) @@ -894,6 +991,21 @@ p->sleep_time = jiffies; list_del(&p->run_list); p->run_list.next = NULL; + /* 2003-01-19 modified by caixue lin*/ +
nr_running_rbed[p->process_type]--; + switch(p->process_type){ + case BE: list_del(&p->be_run_list); + p->be_run_list.next = NULL; + nr_task_rbed[BE] --; + wrap_be_flag= 2; //2: BE process exits the system. + break; + case SRT: + case HRT: list_del(&p->rt_run_list); + p->rt_run_list.next = NULL; + break; + default: break; //a label must be followed by a statement + } + /***************modify ends here***************/ } static inline int task_on_runqueue(struct task_struct *p) @@ -901,6 +1013,15 @@ return (p->run_list.next != NULL); } +static inline int task_on_rt_runqueue(struct task_struct *p) +{ + return (p->rt_run_list.next != NULL); +} +static inline int task_on_be_runqueue(struct task_struct *p) +{ + return (p->be_run_list.next != NULL); +} + static inline void unhash_process(struct task_struct *p) { if (task_on_runqueue(p)) diff --unified --recursive --new-file linux-2.4.20/include/linux/sys.h linux-2.4.20-rbed/include/linux/sys.h --- linux-2.4.20/include/linux/sys.h 1995-12-10 20:56:37.000000000 -0800 +++ linux-2.4.20-rbed/include/linux/sys.h 2004-02-04 19:11:43.000000000 -0800 @@ -4,7 +4,7 @@ /* * system call entry points ...
but not all are defined */ -#define NR_syscalls 256 +#define NR_syscalls 262 /* * These are system calls that will be removed at some time diff --unified --recursive --new-file linux-2.4.20/include/linux/timer.h linux-2.4.20-rbed/include/linux/timer.h --- linux-2.4.20/include/linux/timer.h 2001-11-22 11:46:19.000000000 -0800 +++ linux-2.4.20-rbed/include/linux/timer.h 2003-11-02 23:04:41.000000000 -0800 @@ -23,6 +23,11 @@ extern void add_timer(struct timer_list * timer); extern int del_timer(struct timer_list * timer); +extern int apic_sleep(struct task_struct *, unsigned long long); +extern int apic_start(struct task_struct *, long long); +extern int apic_over(struct task_struct *); +extern void reset_process(struct task_struct *); + #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list * timer); extern void sync_timers(void); diff --unified --recursive --new-file linux-2.4.20/include/linux/uka_apic_timer.h linux-2.4.20-rbed/include/linux/uka_apic_timer.h --- linux-2.4.20/include/linux/uka_apic_timer.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.20-rbed/include/linux/uka_apic_timer.h 2003-04-15 21:41:38.000000000 -0700 @@ -0,0 +1,77 @@ +/* -*- linux-c -*- + * uka_apic_timer.h + * + * Module providing precise timers using the local APIC timer. + * + * Copyright (C) 2000 Vincent Oberle (vincent@oberle.com) + * Institute of Telematics, University of Karlsruhe, Germany. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + */ + +#ifndef _UKA_APIC_TIMER_H +#define _UKA_APIC_TIMER_H + +/*flag to notify the kernel the apic timer is ready to use*/ +extern int apic_timer_setup_flag; +/* + * A timer struct for APIC timers, + * similar to the generic timer_list in timer.h + * + * "expires" is the value of the TSC register when the timer expires. + * "function" is the function to execute.
+ * Its parameters are 1) the expires value 2) the data value. + * "data" is a value/pointer that will be passed to the function. + */ +struct apic_timer_list { + struct apic_timer_list *next, *prev; + unsigned long long expires; + unsigned long data; + void (*function)(unsigned long long, unsigned long); +}; + + +/* Initializes an APIC timer list struct. */ +static inline void init_apic_timer (struct apic_timer_list *timer) +{ + timer->next = timer->prev = NULL; +} + +/* Adds an APIC timer. */ +extern int add_apic_timer (struct apic_timer_list *timer); + +/* Removes an APIC timer. */ +extern int del_apic_timer (struct apic_timer_list *timer); + +/* + * More efficient way to update the expire field of an active timer + * (if the timer is inactive it will be activated). + * mod_apic_timer(a,b) is equivalent to del_apic_timer(a); a->expires = b; add_apic_timer(a) + */ +extern int mod_apic_timer (struct apic_timer_list *timer, unsigned long long expires); + +/*added by caixue lin on 04/14/03*/ +extern int setup_uka_apic_timer (void); +extern void print_apic_timer_list (void); + +/* + * Useful macros to get the high and the low 32 bits of a 64-bits + * unsigned long long (ULL), and to convert the high + low part in the + * 64-bits ULL. Arguments are parenthesized so expressions can be passed. + */ +#define ULL_LOW(x) ((unsigned long)((x) & 0xFFFFFFFF)) +#define ULL_HIGH(x) ((unsigned long)((x) >> 32)) +#define TO_ULL(d, a) (((unsigned long long)(d) << 32) + (a)) + +/* + * Absolute value of the difference. + */ +#define DIFF_ABS(a,b) (((a) > (b)) ?
((a) - (b)) : ((b) - (a))) + +#endif /* _UKA_APIC_TIMER_H */ + + + + diff --unified --recursive --new-file linux-2.4.20/kernel/fork.c linux-2.4.20-rbed/kernel/fork.c --- linux-2.4.20/kernel/fork.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/kernel/fork.c 2004-03-28 18:02:05.000000000 -0800 @@ -23,6 +23,8 @@ #include #include +// 04/13/2003 added by caixue lin +#include #include #include #include @@ -45,6 +47,7 @@ wait->flags &= ~WQ_FLAG_EXCLUSIVE; wq_write_lock_irqsave(&q->lock, flags); __add_wait_queue(q, wait); + wq_write_unlock_irqrestore(&q->lock, flags); } @@ -640,6 +643,8 @@ p->run_list.next = NULL; p->run_list.prev = NULL; + p->rt_run_list.next = NULL; + p->rt_run_list.prev = NULL; p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -713,8 +718,22 @@ */ p->counter = (current->counter + 1) >> 1; current->counter >>= 1; - if (!current->counter) - current->need_resched = 1; + + /*2003-1-19 modified by caixue lin*/ + /***************modify starts here***************/ + p->policy = SCHED_RBED; + p->process_type = 0; //default is BE + p->wcet = 5000; //default wcet 5000us=5ms. Note: this is never used. 
+ p->weight= 12; //the maximum weight is set to 12 + + if(!apic_timer_setup_flag){ + if (!current->counter) + current->need_resched = 1; + } + else + if(!current->apic_left_tsc) + current->need_resched = 1; + /***************modify starts here***************/ /* * Ok, add it to the run-queues and make it diff --unified --recursive --new-file linux-2.4.20/kernel/sched.c linux-2.4.20-rbed/kernel/sched.c --- linux-2.4.20/kernel/sched.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.20-rbed/kernel/sched.c 2004-03-28 22:23:45.000000000 -0800 @@ -30,6 +30,10 @@ #include #include +/*to use 64 bit div*/ +#include + + #include #include @@ -37,6 +41,13 @@ extern void tqueue_bh(void); extern void immediate_bh(void); +//Following functions are defined in timer.c +extern int apic_sleep(struct task_struct *, unsigned long long); +extern int apic_start(struct task_struct *, long long); +extern int apic_over(struct task_struct *); +extern void reset_process(struct task_struct *); +//asmlinkage int sys_rbed_deadline_met(pid_t pid); + /* * scheduler variables */ @@ -70,7 +81,6 @@ #define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) - /* * Init task must be ok at boot for the ix86 as we will check its signals * via the SMP irq return path. @@ -94,10 +104,20 @@ static LIST_HEAD(runqueue_head); +/* added by caixue lin 5/03*/ +static LIST_HEAD(rt_runqueue_head); +static LIST_HEAD(be_runqueue_head); + +//The following definition define a separate SRT task queue from other tasks. +//We can't use 'static' since it is refered in sched.h +LIST_HEAD(srt_taskqueue_head); +/* add ends here*/ + /* * We align per-CPU scheduling data on cacheline boundaries, * to prevent cacheline ping-pong. 
*/ + static union { struct schedule_data { struct task_struct * curr; @@ -125,6 +145,664 @@ #endif +/*added by caixue lin:02/05/2003*/ +/* kernel modification adds here*/ +#define RBED_DEBUG 1 + +#define ALPHA ((long)(0)) +#define BETA ((long)(30))//minimum reserved usage for BEs +#define QOS 1 //QoS policy: underloaded +#define WRAP 2 //Weighted share policy: overloaded +#define QOS_MAX_LEVEL 10 //QoS Levels: [0..9] +#define QOS_RAISE_LIMIT 0 //The minimum resource which triggers the QoS adaptation +#define DEADLINE_MEET 0x00 +#define DEADLINE_MISS 0x10 + +int nr_running_rbed[3]={0,0,0}; //nr of runnable processes of HRT,SRT and BE. +int nr_task_rbed[3]={0,0,0}; //nr of all processes of HRT,SRT and BE. +long rbed_target_rates[3] = {0,0,0};//the sum actual rates for all HRT,SRT and BE respectively. +long rbed_srt_rates_highest = 0; //the sum target rates for all SRTs with highest QoS levels +long rbed_srt_rates_lowest = 0; //the sum target rates for all SRTs with lowest QoS levels +long total_srt_benefit = 0; //records the total benefits for all SRTs + +//parameters for controlling the release time of BE processes +u64 min_be_release_time=0; +u64 base_be_release_time=0; //reset to 0 if no RT process exist + +int rbed_rap = QOS; //resource allocation policy: QOS or WRAP +int qos_flag=0; //for opti (0) or prop (1) qos +int wrap_be_flag = 0; //resource reallocation flag for BE processes +int wrap_flag=0; //for debugging + +//#define TRACE_SIZE 1024 +#define TRACE_SIZE 1536 +struct rbed_rt_trace +{ + unsigned int pid; + unsigned int state; + unsigned long long period; + unsigned long long response_time; +} rt_trace_data[TRACE_SIZE]; + +int rt_trace_next=0; +int rt_trace_overflow=0; + +//RT trace +static inline void rbed_rt_trace_add(struct task_struct *p) +{ + if(rt_trace_next == TRACE_SIZE) + { + rt_trace_next=0; + rt_trace_overflow=1; + } + + rt_trace_data[rt_trace_next].pid = p->pid; + rt_trace_data[rt_trace_next].period= p->period; + + 
rt_trace_data[rt_trace_next].response_time = (p->completion_time - p->initial_release_time); + rt_trace_data[rt_trace_next].state= p->state; + + rt_trace_next++; +} + +struct rbed_trace +{ + unsigned int process_type; + unsigned int pid; + unsigned int state; + unsigned int run; + unsigned long long time; + unsigned long long release_time; + unsigned long actual_period; + long long counter; + int weight; + unsigned long wcet; + unsigned long schedule_overhead; //in tsc + unsigned long edf_overhead; //in tsc + int nr_rt; //number of rt processes + int wrap_flag; +} trace_data[TRACE_SIZE]; + +int trace_run=0; +int trace_next=0; +int trace_overflow=0; +u64 trace_start_time=0; + +//all trace including RT and BE processes +static inline void rbed_trace_add(struct task_struct *p, unsigned long schedule_overhead, + unsigned long edf_overhead, int wrap_flag) +{ + u64 t; + unsigned long low,high; + + rdtsc(low,high); + t = TO_ULL(high,low); + + if(trace_next==0 && trace_start_time ==0)//from the very begining of the trace + trace_start_time = t; + + if(trace_next == TRACE_SIZE) + { + trace_next=0; + trace_overflow=1; + } + + trace_data[trace_next].process_type= p->process_type; + trace_data[trace_next].pid = p->pid; + trace_data[trace_next].state= p->state; + trace_data[trace_next].run = trace_run; + trace_data[trace_next].time = t; + trace_data[trace_next].release_time = p->initial_release_time; + trace_data[trace_next].actual_period = p->actual_period; + trace_data[trace_next].counter = p->counter; + trace_data[trace_next].weight = p->weight; + trace_data[trace_next].wcet= p->wcet; + trace_data[trace_next].schedule_overhead= schedule_overhead; + trace_data[trace_next].edf_overhead= edf_overhead; + trace_data[trace_next].nr_rt= nr_running_rbed[SRT] + nr_running_rbed[HRT]; + trace_data[trace_next].wrap_flag= wrap_flag; + trace_next++; +} + +/*A heuristic method that adjusts the weight of a BE process according + * to its original weight and properties (I/O or CPU) + */ 
+static inline long adjust_be_weight(struct task_struct * p){ + long adjusted_weight, be_num; + + if(p->weight <=1) + adjusted_weight = 10; + else{ + be_num = nr_running_rbed[BE]; + if(be_num <= 1) //only one running process or idle process + return p->weight; //either p->weight or p->weight*10 + be_num *=10; + adjusted_weight = ((p->weight)*10)/6; + if(adjusted_weight == be_num)//avoids zero-division error + be_num +=10; + + //The following calculation ensures the boosted wcet is about + //120ms (weight 12) and the sequential wcets are about 60ms + adjusted_weight = adjusted_weight + adjusted_weight*(adjusted_weight-10)/(be_num - adjusted_weight); + if(adjusted_weight<=0) + adjusted_weight =10; + } + + return adjusted_weight; +} + +/*get the total weight of the runnable BE processes*/ +static inline long get_be_wsum(void){ + struct task_struct * p ; + struct list_head *tmp; + long be_wsum = 0; + + list_for_each(tmp, &be_runqueue_head) { + p = list_entry(tmp, struct task_struct, be_run_list); + if(p->process_type == BE){ + be_wsum += adjust_be_weight(p); + } + } + + return be_wsum; +} + +static inline void reset_be_weights(void){ + struct task_struct * p ; + struct list_head *tmp; + int reset_flag = 1; + + //check if we need to reset the weights + list_for_each(tmp, &be_runqueue_head) { + p = list_entry(tmp, struct task_struct, be_run_list); + if(p->weight >0){ + reset_flag = 0; + goto finish; + } + } + + //reset all the weights for be processes if needed + if(reset_flag ==1){ + for_each_task(p) + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED&&p->process_type==BE){ + if(p->state!=TASK_RUNNING) //boost for sleeping I/O bound process + p->weight = (20/4 +1) + p->weight/2; + else //reset for runnable process + p->weight = 1; + } + } +finish: + return; //a label must be followed by a statement +} + + +static inline void do_wrap_for_each(struct task_struct *p) +{ + unsigned long rbed_actual_rate, target_rate=1; + long be_wsum = 0; + int adjusted_weight = 0; + + switch(p->process_type) + { + case HRT: //do
nothing for HRT + target_rate = p->wcet*1000/p->period; + rbed_actual_rate = target_rate; + break; + case SRT: //do weighted proportional resource allocation for SRTs + //based on their highest qos level + target_rate = p->qos[0].wcet*1000/p->qos[0].period; + rbed_actual_rate =min(target_rate, ((1000 - ALPHA -BETA - rbed_target_rates[HRT]) + *target_rate+rbed_srt_rates_highest/2)/rbed_srt_rates_highest); + break; + case BE: + default: + adjusted_weight = adjust_be_weight(p); + be_wsum = get_be_wsum(); + if(be_wsum <= 0) + be_wsum = 10;//scale up with 10 + rbed_actual_rate = max(BETA, (1000 - ALPHA - rbed_target_rates[HRT] - + rbed_target_rates[SRT])) * adjusted_weight/be_wsum; + break; + } + + if(rbed_actual_rate==0) + rbed_actual_rate=1; + + if(p->process_type != BE ) // RT + p->actual_period = (p->wcet)*1000/rbed_actual_rate; + else{ // BE + p->actual_period = p->period = 60000*(nr_running_rbed[BE]); //dynamic period: (n)*60ms + p->wcet = (p->actual_period * rbed_actual_rate)/1000; + } +} + +//We do not need to do reallocation for every BE process. +//Especially BE, we do lazy resource allocation. +static inline void do_wrap(void) +{ + struct task_struct * p; + struct list_head *tmp; + + //do weighted resource reallocation for every task on the SRT task queue + read_lock_irq(&tasklist_lock); //outer lock first: runqueue_lock nests inside tasklist_lock + spin_lock(&runqueue_lock); + list_for_each(tmp, &srt_taskqueue_head) { + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED) + do_wrap_for_each(p); + } + spin_unlock(&runqueue_lock); + read_unlock_irq(&tasklist_lock); +} + +/* do_rate_constraints(): main dynamic rate adjustment function.
+ * Increase utilization/shorten period and decrease utilization/extend period + * constraints: current we only consider rate adjustment by changing period + */ +static inline void do_rate_constraints(){ + struct task_struct * p; + struct list_head *tmp; + int new_qos_level; + unsigned long low,high; + long new_period=0; + u64 ct; + + //get the current tsc; + rdtsc(low,high); + ct = ((u64)high<<32) + low; + + //go through the srt task queue + list_for_each(tmp, &srt_taskqueue_head) { + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED) + { + //init new qos level to the next expected qos level + new_qos_level = p->ct_qos_level; + //it's always ok to extend a period + if(p->previous_period == p->qos[p->ct_qos_level].period) + continue; + else if(p->previous_period < p->qos[p->ct_qos_level].period) + { + //ignore the sleeping && extending case: no benefit available + if(p->state==TASK_UNINTERRUPTIBLE||p->state==TASK_INTERRUPTIBLE) + continue; + + new_qos_level = p->ct_qos_level; + new_period = p->qos[p->ct_qos_level].period; + } + else{ //shorten period: this is not an easy thing + //just ignore the shortening period case for the + //process which has no input error + if(p->prev_qos_scaler <=1) + continue; + + u64 r, t1=0; + //1. if p is sleeping, it's safe to shorten until current time + if(p->state==TASK_UNINTERRUPTIBLE||p->state==TASK_INTERRUPTIBLE) + { + //get the previous release time + r = p->release_time - p->previous_period*(cpu_khz/1000); + r = ct - r; + do_div(r,(unsigned long)(cpu_khz/1000)); //convert tsc to us + new_period = r; + goto done_new_period; + } + + //2. 
if p is running + long new_rate = p->qos[p->ct_qos_level].wcet*1000/p->qos[p->ct_qos_level].period; + u64 left_tsc = p->apic_left_tsc; + + t1 = ct - p->release_time; + if((s64)t1<0) + t1=0; + + do_div(t1,(unsigned long)(cpu_khz/1000)); //convert tsc to us + do_div(left_tsc,(unsigned long)(cpu_khz/1000)); //convert tsc to us + new_period = t1 + (((long)left_tsc)*1000)/new_rate; + + if(new_period > p->actual_period){ + continue; + } + +done_new_period: + if(new_period <= p->qos[p->ct_qos_level].period){ //it's safe to shorten period + new_qos_level = p->ct_qos_level; + new_period = p->qos[p->ct_qos_level].period; + } + else{ ;//do nothing + } + } + + //if p is sleeping, adjust its sleep time if needed + if(p->state==TASK_UNINTERRUPTIBLE|| p->state==TASK_INTERRUPTIBLE) + { + s64 offset = ((s64)new_period - (s64)p->previous_period)*(cpu_khz/1000); + + //if update is needed + if(offset != 0) //if using abs(offset) > threshold, better??? + { + s64 new_exp=(s64)p->release_time + offset; + + //if already passed current time, then done + if(new_exp <= (s64)ct||(u64)new_exp <= ct) + new_exp = (s64)ct; + + //if already using an apic_timer, update it + if(p->apic_timer!=NULL && p->apic_timer!=0) + { + mod_apic_timer(p->apic_timer, (u64)new_exp); + u64 sleeping_time = (u64)new_exp-ct; + do_div(sleeping_time,(unsigned long)(cpu_khz/1000)); + } + else//if not using an apic_timer, i.e. using udelay() + { + if(offset > 0){ //using a new apic timer + apic_sleep(p, new_exp - ct); + } + else //no need anything, done + ;//printk("not err:too small offset\n"); + } + } + + //1. update the previous actual period + p->release_time -= p->previous_period*(cpu_khz/1000); + p->previous_period = new_period; + p->release_time += p->previous_period*(cpu_khz/1000); + //2. 
update the next actual period (for next release and EDF comparison) + p->period = p->actual_period = p->qos[p->ct_qos_level].period; + p->wcet = p->qos[p->ct_qos_level].wcet; + p->apic_left_tsc = p->wcet*(cpu_khz/1000); + } + else{//if p is running + //update the previous and actual period (Both for current actual period) + p->previous_period = new_period; + p->period = p->actual_period = p->previous_period; + p->wcet = p->qos[new_qos_level].wcet; + //important: we should also update current left_tsc, for now just + //ignore it because all qos levels have the same wcet + } + } + } +} + +/*Proportional QoS: raise or lower qos level proportional to benefit*/ +static inline void do_prop_qos(long slack_rate, int adapt_flag){ + struct task_struct * p; + struct list_head *tmp; + long left_srt_rate, new_scaled_rate, extra_scaled_rate; + long left_srt_benefit; + + if(slack_rate >= QOS_RAISE_LIMIT){ + //compute the available resource usage for srt processes + left_srt_rate = 1000 - ALPHA - BETA - rbed_target_rates[HRT]; + left_srt_benefit = total_srt_benefit; + + list_for_each(tmp, &srt_taskqueue_head) { + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED){ + long rate2 = p->qos[2].wcet*1000/p->qos[2].period; + long rate0 = p->qos[0].wcet*1000/p->qos[0].period; + + if(p->qos_scaler==0){ //we are done if there is no input error + left_srt_rate -=rate2; + continue; + } + + new_scaled_rate = (left_srt_rate*p->qos_scaler) /left_srt_benefit; + if(new_scaled_rate <= rate2){ + new_scaled_rate = rate2; + p->ct_qos_level=2; //lowest level + } + else if (new_scaled_rate >= rate0){ + new_scaled_rate = rate0; + p->ct_qos_level=0; //highest level + } + else{ + p->ct_qos_level=1; //middle level + p->qos[1].period = p->qos[1].wcet*1000/new_scaled_rate; + } + + //minus the assigned new rate + left_srt_rate -= new_scaled_rate; + left_srt_benefit -= p->qos_scaler; + rbed_target_rates[SRT] += (new_scaled_rate-rate2); + } 
+ } + + //If the leftover is too small to be worth + //reallocating, we are done. + if(left_srt_rate<10) //less than 10/1000=1% + goto done_adjustment1; + + left_srt_benefit = total_srt_benefit; + //Here we go through the SRT task list the 2nd time (but from the + // backward) to distribute the extra resources + list_for_each_prev(tmp, &srt_taskqueue_head) { + if(left_srt_rate == 0) + break; + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED){ + //we are done if there is no input error + if(p->qos_scaler==0) + goto done_adjustment1; + + //if p is already fully allocated, continue; + //otherwise, distribute the extra resource proportionally + if(p->ct_qos_level==0){ //already at highest level + left_srt_benefit -= p->qos_scaler; + continue; + } + else + { + extra_scaled_rate = (left_srt_rate*p->qos_scaler)/left_srt_benefit; + //make sure we distribute something every time + if(extra_scaled_rate==0) + extra_scaled_rate = left_srt_rate; + + new_scaled_rate = p->qos[p->ct_qos_level].wcet*1000/p->qos[p->ct_qos_level].period; + new_scaled_rate += extra_scaled_rate; + + long rate0=p->qos[0].wcet*1000/p->qos[0].period; + if (new_scaled_rate >= rate0){ + extra_scaled_rate -= (new_scaled_rate-rate0); //delete the overflow part (must precede the clamp) + new_scaled_rate = rate0; + p->ct_qos_level=0; //highest level + } + else{ + p->ct_qos_level=1; //middle level + p->qos[1].period = p->qos[1].wcet*1000/new_scaled_rate; + } + + //minus the assigned extra rate + left_srt_rate -= extra_scaled_rate; + left_srt_benefit -= p->qos_scaler; + rbed_target_rates[SRT] += extra_scaled_rate; + } + } + } + } + +done_adjustment1: + //subject to dynamic rate and periods adjustment constraints + do_rate_constraints(); +} + +/*Optimal QoS: give maximum resource to the process with largest benefit*/ +static inline void do_opti_qos(long slack_rate, int adapt_flag){ + struct task_struct * p; + struct list_head *tmp; + long left_srt_rate,
new_scaled_rate, extra_scaled_rate; + long left_srt_benefit; + int first_task,nr_srt; + + if(slack_rate >= QOS_RAISE_LIMIT){ + first_task = 1; + nr_srt = nr_task_rbed[SRT]; + + //compute the available resource usage for srt processes + left_srt_rate = 1000 - ALPHA - BETA - rbed_target_rates[HRT]; + + //Go thourgh the list barkward, do resource distribution + //proportionally to the benefit (except the first one) + list_for_each_prev(tmp, &srt_taskqueue_head) { + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED){ + //we are done if there is no input error + if(p->qos_scaler==0) + goto done_adjustment2; + + long rate2 = p->qos[2].wcet*1000/p->qos[2].period; + long rate0 = p->qos[0].wcet*1000/p->qos[0].period; + //If it's the first SRT task (highest qos benefit), + //we allocate maximum resources to it. + if(first_task == 1&&p->qos_scaler!=1){ + first_task = 0; + nr_srt --; + new_scaled_rate = left_srt_rate - nr_srt*rate2; + if(new_scaled_rate >= rate0){ + p->ct_qos_level=0; //highest + new_scaled_rate = rate0; + } + else{ + p->ct_qos_level=1; //middle + p->qos[1].period = p->qos[1].wcet*1000/new_scaled_rate; + } + //minus the assigned new rate + left_srt_rate -= new_scaled_rate; + } + else//try to share the leftover + { + if(nr_srt!=0) + new_scaled_rate = left_srt_rate/nr_srt; + else//is this possible? 
+ new_scaled_rate = left_srt_rate; + + if (new_scaled_rate <= rate2) + p->ct_qos_level=2; //lowest + else if (new_scaled_rate >= rate0) + p->ct_qos_level=0; //highest + else{ + p->ct_qos_level=1; //middle + p->qos[1].period = p->qos[1].wcet*1000/new_scaled_rate; + } + } + + rbed_target_rates[SRT] += (new_scaled_rate-rate2); + } + } + } + +done_adjustment2: + //subject to dynamic rate and periods adjustment constraints + do_rate_constraints(); +} + +/*reset the qos level of each SRT*/ +static inline void reset_qos_level(int level){ + struct task_struct * p; + struct list_head *tmp; + + list_for_each(tmp, &srt_taskqueue_head) { + p = list_entry(tmp, struct task_struct, srt_task_list); + if(p!=&init_task&&p->state!=TASK_ZOMBIE&&p->state!=TASK_STOPPED){ + p->ct_qos_level = level?p->nr_qos_level-1:0; + } + } + rbed_target_rates[SRT] = level?rbed_srt_rates_lowest:rbed_srt_rates_highest; +} + +/*Try to do qos management or wrap management according to the + * system status: underload or overload + */ +static inline void rbed_dra(int adapt_flag){ + long qos_slack_rate; + + //We are done if there is no SRT process in the system + if(nr_task_rbed[SRT] <= 0) + goto done_dra; + + //Start to reallocate resource now. + //1. Do WRAP if we don't have enough resource to fit + //the lowest QoS levels for all SRT processes + if(1000 - ALPHA - BETA - rbed_target_rates[HRT] - rbed_srt_rates_lowest<0){//From QOS -> WRAP + if(rbed_rap != WRAP) + rbed_rap = WRAP; + do_wrap(); + } + else{ // 2. Do QOS management + if(rbed_rap != QOS) //From WRAP -> QOS, raise level definitely + rbed_rap = QOS; + + //reset to lowest qos level, and then start to raise levels + reset_qos_level(1); + + //determine the slack rate for SRT processes. 
+ qos_slack_rate = 1000 - ALPHA - BETA - rbed_target_rates[HRT] - rbed_target_rates[SRT]; + + //trigger the qos adaptation + if(qos_flag==0) + do_opti_qos(qos_slack_rate, adapt_flag); + else + do_prop_qos(qos_slack_rate, adapt_flag); + } +done_dra: + return; +} + + +/*Reset the scheduling parameters for the process when a process wakes up*/ +static inline void init_process(struct task_struct *p) +{ + struct list_head * tmp; + struct task_struct * prev_p; + s64 t; + unsigned long low,high; + + rdtsc(low,high); + t = ((u64)high<<32) + low; + + if(p->process_type == BE){ + if (min_be_release_time < t) + min_be_release_time = t; + p->release_time = min_be_release_time ; + + //update/accumulate the base release time for BE + //processes when there is RT process in the system + if(nr_task_rbed[SRT]>0||nr_task_rbed[HRT]>0){ + //find and assign the possible earliest release time + //of the earliest BE to the new BE process; + tmp= &be_runqueue_head; + + if(tmp->next!=tmp){ //at least one BE process in runqueue + prev_p = list_entry(tmp->next, struct task_struct, be_run_list); + p->release_time = prev_p->release_time; + //Is the next sentence correct? will it overflow? 
+ base_be_release_time += prev_p->actual_period*(cpu_khz/1000) - ((prev_p->actual_period + *1000)/prev_p->wcet) * ((long)prev_p->apic_left_tsc/1000); + } + else //no BE in runqueue: reset the base BE release time to 0 + base_be_release_time = 0; + } + else //no RT process in system: reset the base BE release time to 0 + base_be_release_time = 0; + + p->apic_left_tsc= p->wcet*(cpu_khz/1000); + } + else{ //for RT + + p->release_time = t; + p->period = p->actual_period = p->qos[p->ct_qos_level].period; + p->wcet = p->qos[p->ct_qos_level].wcet; + p->apic_left_tsc = p->wcet*(cpu_khz/1000); + } + p->apic_start_tsc = t; + + //for tracing data and debugging + p->initial_release_time = p->release_time; +} +/* kernel modification ends here*/ + + void scheduling_functions_start_here(void) { } /* @@ -140,8 +818,7 @@ * +ve: "goodness" value (the larger, the better) * +1000: realtime process, select this. */ - -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +static inline s64 goodness_normal(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) { int weight; @@ -157,7 +834,7 @@ /* * Non-RT process - normal case first. */ - if (p->policy == SCHED_OTHER) { + if (p->policy == SCHED_OTHER ||p->policy == SCHED_RBED) { /* * Give the process a first-approximation goodness value * according to the number of clock-ticks it has left. 
@@ -193,13 +870,35 @@ return weight; } +static inline s64 goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) +{ + /* 01/19/03 modified by caixue lin*/ + if (p->policy == SCHED_RBED || p->policy & SCHED_RBED) + { + //return the updated deadline + if(p->process_type==BE){ + //if there are more BE processes come in, we need + //to update the pseudo period immediately; otherwise we + //just delay the update of the pseudo period for BE + if(p->period < 60000*(nr_running_rbed[0])) + p->actual_period = p->period = 60000*(nr_running_rbed[0]); + return base_be_release_time + p->release_time + p->actual_period*(cpu_khz/1000); + } + else + return p->release_time + p->actual_period*(cpu_khz/1000); + } + else + return goodness_normal(p, this_cpu, this_mm); +} + /* * the 'goodness value' of replacing a process on a given CPU. * positive value means 'replace', zero or negative means 'dont'. */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +//static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline s64 preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + return goodness(prev, cpu, prev->active_mm) - goodness(p, cpu, prev->active_mm); } /* @@ -283,7 +982,8 @@ } } else { if (oldest_idle == -1ULL) { - int prio = preemption_goodness(tsk, p, cpu); + /*01/29/03 added by caixue lin*/ + s64 prio = preemption_goodness(tsk, p, cpu); if (prio > max_prio) { max_prio = prio; @@ -330,6 +1030,27 @@ { list_add_tail(&p->run_list, &runqueue_head); nr_running++; + + /* 2003-01-19 following statements added by caixue lin*/ + nr_running_rbed[p->process_type]++; + switch(p->process_type){ + case 0: + //add to the head of the be run queue + list_add(&p->be_run_list, &be_runqueue_head); + wrap_be_flag = 1; //1: BE process enters the system. 
+ break; + case 1: + case 2: + list_add_tail(&p->rt_run_list, &rt_runqueue_head); //add_to_rt_runqueue(p); + break; + default:; + } +} + +void move_be_last_runqueue(struct task_struct * p) +{ + list_del(&p->be_run_list); + list_add_tail(&p->be_run_list, &be_runqueue_head); } static inline void move_last_runqueue(struct task_struct * p) @@ -356,11 +1077,34 @@ */ spin_lock_irqsave(&runqueue_lock, flags); p->state = TASK_RUNNING; - if (task_on_runqueue(p)) + + /*modification starts here*/ + if (task_on_runqueue(p)){ + init_process(p); goto out; + } + + if(p->process_type==BE) + nr_task_rbed[p->process_type]++; //update the count of the live tasks in the system. + + init_process(p); add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) + + if(p->process_type==BE){ + p->actual_period = p->period = 60000*nr_running_rbed[BE]; + rbed_target_rates[p->process_type] += (p->wcet*1000 + (p->period)/2)/p->period; + } + + current->need_resched = 1; + + /* + if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))){ reschedule_idle(p); + + } + */ + /*modification ends here*/ + success = 1; out: spin_unlock_irqrestore(&runqueue_lock, flags); @@ -549,8 +1293,19 @@ struct schedule_data * sched_data; struct task_struct *prev, *next, *p; struct list_head *tmp; - int this_cpu, c; - + /* 2003-02-18 modified by caixue lin*/ + /***************modify starts here***************/ + //int this_cpu, c; + int this_cpu; + s64 c; + s64 current_tsc, start_tsc, end_tsc, edf_start_tsc; + unsigned long low,high; + unsigned long schedule_overhead, edf_overhead; + +#ifdef RBED_DEBUG + rdtsc(low,high); + start_tsc= ((u64)high<<32) + low; +#endif spin_lock_prefetch(&runqueue_lock); @@ -588,34 +1343,82 @@ break; } default: + //update the min possible release_time for BEs + if(prev->process_type == BE){ + min_be_release_time = prev->release_time - ((prev->actual_period * 1000)/prev->wcet) + * ((long)prev->apic_left_tsc/1000); + min_be_release_time += 
60000*nr_running_rbed[BE]*(cpu_khz/1000); + } + del_from_runqueue(prev); + case TASK_RUNNING:; } prev->need_resched = 0; + /* Check if we need to reallocate the resource at this point. + * Can we move it to do_exit()? + */ + switch (prev->state) { + case TASK_ZOMBIE: + case TASK_STOPPED: + if(prev->process_type == SRT){ + rbed_srt_rates_highest -= prev->qos[0].wcet*1000/prev->qos[0].period; + rbed_srt_rates_lowest -= prev->qos[prev->nr_qos_level-1].wcet*1000 + /prev->qos[prev->nr_qos_level-1].period; + rbed_target_rates[SRT] -= prev->qos[prev->ct_qos_level].wcet*1000 + /prev->qos[prev->ct_qos_level].period; + total_srt_benefit -= prev->qos_scaler; + } + else + rbed_target_rates[prev->process_type] -= prev->wcet*1000/prev->period; + + //try to raise qos level + if(prev->process_type != BE){ + //update the count of the live tasks in the system. + //Note: BE number is updated in del_from_runqueue() in sched.h + nr_task_rbed[prev->process_type]--; + rbed_dra(0); +#ifdef RBED_DEBUG + wrap_flag=1; +#endif + } + + default:; + } + + reset_be_weights(); + + /* * this is the scheduler proper: */ repeat_schedule: + /* * Default process to select.. */ next = idle_task(this_cpu); + + //If apic timer is ready, we skip the normal Linux scheduling algorithm + if(apic_timer_setup_flag) + goto RBED; + c = -1000; list_for_each(tmp, &runqueue_head) { p = list_entry(tmp, struct task_struct, run_list); if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); + int weight = goodness_normal(p, this_cpu, prev->active_mm); if (weight > c) c = weight, next = p; } } - + /* Do we need to re-calculate counters? 
*/ if (unlikely(!c)) { struct task_struct *p; - + spin_unlock_irq(&runqueue_lock); read_lock(&tasklist_lock); for_each_task(p) @@ -624,6 +1427,112 @@ spin_lock_irq(&runqueue_lock); goto repeat_schedule; } + + //here, because apic timer is not ready, we skip the RBED scheduling algorithm + goto normal_done; + + +RBED: +#ifdef RBED_DEBUG + rdtsc(low,high); + edf_start_tsc = ((u64)high<<32) + low; +#endif + + c = (s64)0x7FFFFFFFFFFFFFF;//7+14F, why not use 15F? + + //1. get the RT process with the earliest deadline + list_for_each(tmp, &rt_runqueue_head) { + p = list_entry(tmp, struct task_struct, rt_run_list); + + //if a RT never runs as RT before, refill the left_tsc + if(p->rbed_state == -1){ + p->rbed_state = 0; + //use current wcet? or current qos level??? + p->apic_left_tsc = p->wcet*(cpu_khz/1000); + } + + if(p->apic_left_tsc <= 0 ){ + p->rbed_state = 0x20; //APIC_BRAKE -> overrun resource +#ifdef RBED_DEBUG + printk("schedule 1: running out of execution time\n"); +#endif + reset_process(p); + } + + if (can_schedule(p, this_cpu)) { + s64 t = goodness(p, this_cpu, prev->active_mm); + if ( t < c){ + c = t, next = p; + } + } + } + + //2. 
get the first BE process in the sorted BE runqueue + tmp= &be_runqueue_head; + if(tmp->next!=tmp){//more than one BE process in runqueue + s64 t; + p = list_entry(tmp->next, struct task_struct, be_run_list); + + if(p->apic_left_tsc <= 0 ) + reset_process(p); + + if (can_schedule(p, this_cpu)) { + t = goodness(p, this_cpu, prev->active_mm); + if ( t < c){ + c = t, next = p; + } + } + } + +#ifdef RBED_DEBUG + rdtsc(low,high); + current_tsc = ((u64)high<<32) + low; + edf_overhead = (unsigned long)(current_tsc - edf_start_tsc); +#endif + + //if we pick a BE to run, then check if we need reallocate resource for it + if(c < (s64)0x7FFFFFFFFFFFFFF && next->process_type == BE){ + if((nr_running_rbed[BE]>0)){ + long long old_tsc = next->wcet*(cpu_khz/1000); + long long used_tsc = old_tsc - next->apic_left_tsc; + + //do lazy resource allocation for this BE process + do_wrap_for_each(next); + wrap_be_flag=0; //reset the resource reallocation flag + next->apic_left_tsc = next->wcet*(cpu_khz/1000); //reset the tsc + next->apic_left_tsc -= used_tsc; + if(next->apic_left_tsc <= 0){ + goto repeat_schedule; + } + } + } + +normal_done: + /*************** apic one-shot timer starts here***************/ + rdtsc(low,high); + current_tsc = ((u64)high<<32) + low; + + //Is the apic timer ready to use? 
+ if(apic_timer_setup_flag){ + if(prev != idle_task(this_cpu)){ + prev->apic_left_tsc -= (current_tsc - prev->apic_start_tsc); + //delete the old apic timer for the previous process + apic_over(prev); + } + if(next != idle_task(this_cpu)){ + next->apic_start_tsc = current_tsc; + + if(next->apic_left_tsc <= 0){ + next->rbed_state = 0x20; //overrun resource + if(next->process_type==SRT) + printk("schedule 2: running out of execution time\n"); + reset_process(next); + } + + //start the new apic timer for the next process + apic_start(next, next->apic_left_tsc); + } + } /* * from this point on nothing can prevent us from @@ -634,11 +1543,32 @@ task_set_cpu(next, this_cpu); spin_unlock_irq(&runqueue_lock); + if (unlikely(prev == next)) { /* We won't go through the normal tail, so do this by hand */ + + /* this is for old normal linux scheduling, but not rbed prev->policy &= ~SCHED_YIELD; + */ + +#ifdef RBED_DEBUG + trace_run++; +#endif + goto same_process; } +#ifdef RBED_DEBUG + else//not the same process, context switch + { + rdtsc(low,high); + current_tsc = ((u64)high<<32) + low; + schedule_overhead = (unsigned long)(current_tsc - start_tsc); + + rbed_trace_add(prev, schedule_overhead, edf_overhead, wrap_flag); + wrap_flag=0; + trace_run=0; + } +#endif #ifdef CONFIG_SMP /* @@ -699,9 +1629,26 @@ reacquire_kernel_lock(current); if (current->need_resched) goto need_resched_back; + return; } +asmlinkage void user_schedule(void) +{ +#ifdef CONFIG_KGDB_THREAD + current->thread.kgdbregs = NULL; +#endif + schedule(); +} + +#ifdef CONFIG_KGDB_THREAD +asmlinkage void kern_do_schedule(struct pt_regs regs) +{ + current->thread.kgdbregs = &regs; + schedule(); +} +#endif + /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything * up. 
If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the @@ -824,28 +1771,114 @@ return timeout; } +/*************rbed_sleep()**************** + * This function will be called from sys_rbed_deadline_met function. + * + * Get the current time using the rdtsc register, compare it to the + * deadline of the process. If completion time < deadline, sleep for + * deadline - completion_time, else don't sleep. + * Return value: + * 1) The last four bits are for qos levels and wrap flag + * 0x*0..0x*9 -> 0..9 qos levels + * 0x*A -> wrap + * 2) The next four bits are for deadline meet flags. + * 0x0* for made it + * 0x1* didn't make it + * 0x2* apic break occured before + ********************************************/ +static inline int rbed_sleep(struct task_struct *p) +{ + unsigned long low, high; + unsigned long long slack; + unsigned long long completion_time; + unsigned long long deadline; + int rtn = 0; + + /* first get the current time with the rdtsc register */ + /* largest resolution is 4GB because we can only use low */ + rdtsc(low,high); + + /* get the competion time of p in microseconds. 
*/ + /* 32-bit value (is low portion of a 64-bit value) */ + //we do not short the deadline until the next release + deadline = p->release_time + p->actual_period*(cpu_khz/1000); + completion_time = p->completion_time = TO_ULL(high,low); + slack = (deadline - completion_time); + + rbed_rt_trace_add(p); //do trace here + + if(completion_time <= deadline)/* met deadline */ + { + //Rate-Based SRT class is special: release next job + //immediately after the previous job is done + if(p->process_type == SRT && p->srt_type == RBSRT){ + unsigned long bufferSize = p->qos[p->ct_qos_level].bufferSize; + unsigned long long bufferSizeInCycles = (u64)p->period; + bufferSizeInCycles *= ((cpu_khz/1000) * bufferSize); + + //unbounded buffer, or buffer is still not full + if(!bufferSize || slack < bufferSizeInCycles){ + //release next job immediately + reset_process(p); + current->need_resched = 1; + } + else{//buffer is overflow; block until half of the buffer is empty + //udelay is never used if buffer is not too small + apic_sleep(p, bufferSizeInCycles >> 1); + reset_process(p); + } + } + else{//HRT or other SRT classes + p->release_time = deadline; + if(apic_sleep(p, slack)){ //if use udelay, then reschedule now + init_process(p); + current->need_resched = 1; + } + } + rtn = p->rbed_state | DEADLINE_MEET; + p->rbed_state = 0x00; + } + else{ //we didn't make deadline start the next period right now + init_process(p); + current->need_resched = 1; + + printk("Miss deadline: completion_time= %llu , deadline= %llu," + " diff=%llu\n", completion_time, deadline, completion_time - deadline); + + rtn = p->rbed_state | DEADLINE_MISS; + p->rbed_state = 0x00; + } + + if(rbed_rap == WRAP) + rtn |= 0x0A; //10:use WRAP + else + rtn |= p->ct_qos_level; //0..9:use QOS + + return rtn; +} + void sleep_on(wait_queue_head_t *q) { - SLEEP_ON_VAR + SLEEP_ON_VAR - current->state = TASK_UNINTERRUPTIBLE; + current->state = TASK_UNINTERRUPTIBLE; - SLEEP_ON_HEAD - schedule(); - SLEEP_ON_TAIL -} + SLEEP_ON_HEAD 
+ schedule(); + SLEEP_ON_TAIL + } long sleep_on_timeout(wait_queue_head_t *q, long timeout) { - SLEEP_ON_VAR + SLEEP_ON_VAR - current->state = TASK_UNINTERRUPTIBLE; + current->state = TASK_UNINTERRUPTIBLE; - SLEEP_ON_HEAD - timeout = schedule_timeout(timeout); - SLEEP_ON_TAIL + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL - return timeout; + return timeout; } void scheduling_functions_end_here(void) { } @@ -940,6 +1973,207 @@ if (lp.sched_priority < 0 || lp.sched_priority > 99) goto out_unlock; if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + goto out_unlock; + + retval = -EPERM; + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = 0; + p->policy = policy; + p->rt_priority = lp.sched_priority; + + current->need_resched = 1; + +out_unlock: + spin_unlock(&runqueue_lock); + read_unlock_irq(&tasklist_lock); + +out_nounlock: + return retval; +} + + +#include <linux/slab.h> //for kmalloc and kfree + +/* Find the position and insert the srt task to the + * srt task list according to the values of qos_scaler + */ +void insert_srt_task_list(struct task_struct *p){ + struct task_struct * next_p=NULL; + struct list_head *tmp; + int found=0; + + list_for_each(tmp, &srt_taskqueue_head) { + next_p = list_entry(tmp, struct task_struct, srt_task_list); + //find the position to insert + if(next_p->qos_scaler > p->qos_scaler||(next_p->qos_scaler == p->qos_scaler&& next_p->prev_qos_scaler > p->prev_qos_scaler)||(next_p->qos_scaler == p->qos_scaler && next_p->prev_qos_scaler==p->prev_qos_scaler&&next_p->actual_period < p->actual_period)) + { + found=1; + break; + } + } + //insert to the srt task list + if(next_p!=NULL&&found==1) + list_add_tail(&p->srt_task_list, &next_p->srt_task_list); + else + list_add_tail(&p->srt_task_list, &srt_taskqueue_head); +} + +/* Convert a BE process into RT process + */ 
+static int set_rbed_rt(struct task_struct *p, struct sched_param lp){ + unsigned long low,high; + long save_rate,new_rate; + int rtn = 0; + + if(p->process_type == lp.process_type){ + printk("You can't set a process with the same process type\n"); + return -500; + } + + //save the BE resource rate + save_rate = p->wcet*1000/p->period; + + //get all the qos specifications (caller holds runqueue_lock with irqs off: must not sleep) + if (lp.nr_qos_level<=0){ + printk("Error qos specifications from user process\n"); + return -1000; + } + p->qos = (struct qos_struct *) kmalloc(lp.nr_qos_level*sizeof(struct qos_struct), GFP_ATOMIC); + if (p->qos==0){ + printk("Couldn't kmalloc the size for qos specifications\n"); + return -2000; + } + + if (copy_from_user(p->qos, lp.qos, lp.nr_qos_level*sizeof(struct qos_struct))){ + printk("Couldn't copy the qos specifications from user space to kernel space\n"); + kfree(p->qos); p->qos = NULL; + return -3000; + } + p->nr_qos_level=lp.nr_qos_level; + p->qos_scaler = lp.qos_scaler; //This will be set at application side + p->prev_qos_scaler = 1; + + p->ct_qos_level=0; + p->wcet = p->qos[0].wcet; + + //For BE and RT, actual_period always equal to period + //For SRT, actual_period may be not equal to period at runtime + p->actual_period = p->period = p->qos[0].period; + p->previous_period = p->actual_period; + new_rate = p->wcet*1000/p->period; + + //Admission control for HRT processes + if(lp.process_type == HRT){ + //if there is not enough resource for the HRT, it is rejected + if( new_rate > 1000 - ALPHA - BETA - rbed_target_rates[HRT]){ + kfree(p->qos); p->qos = NULL; + return -4000; + } + } + + p->counter = p->wcet/tick + (p->wcet%tick==0?0:1);//?????? do we need this? + //Important: change qos level shoud change wcet and left_tsc + //init left-tsc as later as possible: BE -> SRT + p->apic_left_tsc = p->wcet*(cpu_khz/1000); + + //Now convert the BE process to RT process + nr_running_rbed[p->process_type]--; + nr_task_rbed[p->process_type]--; //update the count of the live tasks in the system. 
+ rbed_target_rates[p->process_type] -= save_rate; + list_del(&p->be_run_list); + + p->process_type = lp.process_type; //process type: HRT,SRT,BE + p->rbed_state = -1; //-1: meaning never running as RT before + p->srt_type = lp.srt_type; //srt type: MDSRT,RASRT,RBSRT + + list_add_tail(&p->rt_run_list, &rt_runqueue_head); + nr_running_rbed[p->process_type]++; + nr_task_rbed[p->process_type]++; + rbed_target_rates[p->process_type] += new_rate; + + //If it's SRT, we have to do something special here + if(p->process_type == SRT){ + //update the total srt benefits + total_srt_benefit += p->qos_scaler; + //Insert to the SRT task queue according to the qos scaler, i.e scaled benefit + insert_srt_task_list(p); + + //Update the total lowest and highest target rate of all SRTs + rbed_srt_rates_highest += p->qos[0].wcet*1000/p->qos[0].period; + rbed_srt_rates_lowest += p->qos[p->nr_qos_level-1].wcet*1000 + /p->qos[p->nr_qos_level-1].period; + + } + + rdtsc(low,high); + p->release_time= p->apic_start_tsc = ((u64)high<<32)+low; + + rbed_dra(1); //do resource re-allocation + + // should we do this, because the "BE -> RT" == "BE left the system"??? + min_be_release_time = p->release_time; + + if(rbed_rap == WRAP) + rtn = 0x0A; //10:use WRAP + else + rtn = p->ct_qos_level; //0..9:use QOS + + return rtn; +} + +static int set_rbed_schedule (pid_t pid, int policy, struct sched_param *param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + retval = -EINVAL; + if (!param || pid < 0) + goto out_nounlock; + + retval = -EFAULT; + if (copy_from_user(&lp, param, sizeof(struct sched_param))) + goto out_nounlock; + + + /* + * We play safe to avoid deadlocks. 
+ */ + read_lock_irq(&tasklist_lock); + spin_lock(&runqueue_lock); + + p = find_process_by_pid(pid); + + retval = -ESRCH; + if (!p) + goto out_unlock; + + if (policy < 0) + policy = p->policy; + else { + retval = -EINVAL; + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_OTHER&& + policy != SCHED_RBED) // 01/19/03 added by caixue lin + goto out_unlock; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid + * priority for SCHED_OTHER is 0. + */ + retval = -EINVAL; + if (lp.sched_priority < 0 || lp.sched_priority > 99) + goto out_unlock; + + // 01/19/03 the next sentence was modified by caixue lin + if ((policy == SCHED_OTHER||policy == SCHED_RBED) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; @@ -954,6 +2188,15 @@ p->policy = policy; p->rt_priority = lp.sched_priority; +/* 06/04/03 modified by caixue lin*/ + if(policy == SCHED_RBED){ + //convert the BE to RT + retval = set_rbed_rt(p,lp); + if(retval < 0) + goto out_unlock; + } +/* modification ends here*/ + current->need_resched = 1; out_unlock: @@ -964,6 +2207,25 @@ return retval; } +/*new rbed system calls from*/ +asmlinkage int sys_rbed_schedule(pid_t pid, int policy, struct sched_param *param){ + return set_rbed_schedule(pid, policy, param); +} + +asmlinkage int sys_rbed_deadline_met(pid_t pid) +{ + struct task_struct *p; + int retval; + + p = find_process_by_pid(pid); + + /* call sleep function; an unknown pid yields -ESRCH instead of a NULL dereference */ + retval = p ? rbed_sleep(p) : -ESRCH; + return retval; +} + +/* End of new rbed system calls added */ + asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { @@ -1041,14 +2303,14 @@ #if CONFIG_SMP int i; - // Subtract non-idle processes running on other CPUs. + /* Subtract non-idle processes running on other CPUs. 
*/ for (i = 0; i < smp_num_cpus; i++) { int cpu = cpu_logical_map(i); if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) nr_pending--; } #else - // on UP this process is on the runqueue as well + /* on UP this process is on the runqueue as well */ nr_pending--; #endif if (nr_pending) { @@ -1274,6 +2536,7 @@ this_task->cap_permitted = CAP_FULL_SET; this_task->keep_capabilities = 0; memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); + this_task->user = INIT_USER; spin_unlock(&runqueue_lock); @@ -1291,8 +2554,10 @@ /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them + * If we were started as result of loading a module, + * close all of the + * user space pages. We don't need them, and if we + * didn't close them * they would be locked into memory. */ exit_mm(current); @@ -1305,9 +2570,11 @@ exit_fs(current); /* current->fs->count--; */ fs = init_task.fs; + current->fs = fs; atomic_inc(&fs->count); exit_files(current); + current->files = init_task.files; atomic_inc(&current->files->count); } @@ -1357,3 +2624,111 @@ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current, cpu); } + +/*Following five system calls were added by caixue lin*/ +asmlinkage int sys_rbed_rt_tracedump(int *num, struct rbed_rt_trace *tr) +{ + int userstart; + int overflow = rt_trace_overflow; + + if (overflow) { + userstart = TRACE_SIZE - rt_trace_next; + copy_to_user(&tr[0], &rt_trace_data[rt_trace_next], + userstart * sizeof(struct rbed_rt_trace)); + + put_user(TRACE_SIZE, num); + } + else { + userstart=0; + put_user(rt_trace_next, num); + } + + copy_to_user(&tr[userstart], &rt_trace_data[0], + rt_trace_next * sizeof(struct rbed_rt_trace)); + + rt_trace_next=0; + rt_trace_overflow=0; + + return overflow; +} + +asmlinkage int sys_rbed_tracedump(int *num, struct rbed_trace *tr) +{ + int userstart; + int overflow = trace_overflow; + + + if (overflow) { + userstart = TRACE_SIZE - 
trace_next; + copy_to_user(&tr[0], &trace_data[trace_next], + userstart * sizeof(struct rbed_trace)); + + put_user(TRACE_SIZE, num); + } + else { + userstart=0; + put_user(trace_next, num); + } + + copy_to_user(&tr[userstart], &trace_data[0], + trace_next * sizeof(struct rbed_trace)); + + trace_next=0; + trace_overflow=0; + + return overflow; +} + +/* Adjust the rate for the process according to the rescaled + * benefit by the control error and return current actual period + * for current SRT process*/ +asmlinkage long sys_rate_adjust(long control_error) +{ + //save the prev qos scaler + current->prev_qos_scaler = current->qos_scaler; + + //if new error occurs + if(current->qos_scaler != control_error){ + //update the total srt benefits + total_srt_benefit -= current->qos_scaler; + current->qos_scaler = control_error; + total_srt_benefit += current->qos_scaler; + + //resort the srt list + list_del(&current->srt_task_list); + insert_srt_task_list(current); + + rbed_dra(2); //do resource reallocation + + //reschedule the processes according to the new periods + current->need_resched = 1; + } + + return (long)(current->qos[current->ct_qos_level].period); +} + +/* Get the previous actual period for the current SRT process + * flag = 0, previous actual period, flag =1 , next expected period; + */ +asmlinkage long sys_getperiod(int flag){ + if(flag == 0){ + long previous_period = current->previous_period; + + //update previous period + current->previous_period = current->actual_period; + return (long)(previous_period); + } + else + return (long)(current->qos[current->ct_qos_level].period); +} + +/* Set qos_flag for Optimal or Proportional qos + * opti: qos_flag = 0; prop: qos_flag = 1. 
+ */
+asmlinkage int sys_setqos(int flag){
+	qos_flag = flag;
+	printk("qos_flag = %d\n", qos_flag);
+
+	return flag;
+}
+/*add ends here*/
diff --unified --recursive --new-file linux-2.4.20/kernel/timer.c linux-2.4.20-rbed/kernel/timer.c
--- linux-2.4.20/kernel/timer.c	2002-11-28 15:53:15.000000000 -0800
+++ linux-2.4.20-rbed/kernel/timer.c	2004-03-28 22:25:11.000000000 -0800
@@ -599,8 +599,12 @@
 
 	update_one_process(p, user_tick, system, cpu);
 	if (p->pid) {
+		//If apic timer is not ready, we use the normal Linux tick scheduling.
+		/*following line added by caixue lin*/
+		if(!apic_timer_setup_flag){
 		if (--p->counter <= 0) {
 			p->counter = 0;
+
 			/*
 			 * SCHED_FIFO is priority preemption, so this is
 			 * not the place to decide whether to reschedule a
@@ -610,6 +614,7 @@
 				p->need_resched = 1;
 			}
 		}
+		}
 		if (p->nice > 0)
 			kstat.per_cpu_nice[cpu] += user_tick;
 		else
@@ -833,6 +838,47 @@
 	return current->pid;
 }
 
+
+//These are defined in sched.c
+extern void move_be_last_runqueue(struct task_struct *p);
+
+/* Reset the scheduling parameters (release time, period, remaining budget) of a process when it wakes up from sleeping */
+void reset_process(struct task_struct *p)
+{
+	s64 now;
+	unsigned long low,high;
+
+	rdtsc(low,high);
+	now = ((u64)high<<32) + low;
+
+	if(p->process_type == 0){//for BE processes
+		//we extend the period immediately, but shorten it until next deadline
+		if(p->period < 60000*(nr_running_rbed[0]))
+			p->actual_period = p->period = 60000*(nr_running_rbed[0]);
+		p->release_time += p->actual_period*(cpu_khz/1000);
+		p->apic_left_tsc = p->wcet*(cpu_khz/1000);
+		p->actual_period = p->period = 60000*(nr_running_rbed[0]);
+		move_be_last_runqueue(p);
+
+		//re-adjust the release time if it is delayed
+		if(now > p->release_time + p->actual_period*(cpu_khz/1000)){
+			p->release_time = now;
+			p->apic_left_tsc = p->wcet*(cpu_khz/1000);
+		}
+	}
+	else{ //for RT processes
+		//advance the current release time with the previous period.
+		p->release_time += p->period*(cpu_khz/1000);
+		p->period = p->actual_period = p->qos[p->ct_qos_level].period;
+		p->wcet = p->qos[p->ct_qos_level].wcet;
+		p->apic_left_tsc = p->wcet*(cpu_khz/1000);
+	}
+	p->apic_start_tsc = now;
+
+	//for tracing data and debugging
+	p->initial_release_time = p->release_time;
+}
+
 asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
 {
 	struct timespec t;
@@ -855,6 +901,8 @@
 		 * Its important on SMP not to do this holding locks.
 		 */
 		udelay((t.tv_nsec + 999) / 1000);
+		reset_process(current);
+		current->need_resched = 1; //or schedule();
 		return 0;
 	}
 
@@ -874,3 +922,160 @@
 	return 0;
 }
 
+/* This code will be inserted in the Linux kernel,
+ * most likely timer.c
+ * NOTE(review): the four #include lines below carry no header name in
+ * this copy of the patch; restore them before applying it.
+ */
+#include
+#include
+#include
+#include
+#define APIC_BRAKE 0x20
+
+static DECLARE_WAIT_QUEUE_HEAD(apic_wq_head);
+struct timer_info
+{
+	struct task_struct *p;
+	struct apic_timer_list timer;
+};
+int apic_over(struct task_struct *);
+
+/* Convert microseconds to clock cycles used by the apic_timer functions.
+ * The product is computed in unsigned long, so it wraps for large us.
+ */
+inline unsigned long US_TO_CLOCK_CYCLES(unsigned long us)
+{
+	unsigned long CYCLES_PER_US = cpu_khz/1000;
+	unsigned long cycles = us * CYCLES_PER_US; //approximate
+
+	return cycles;
+}
+
+/* apic_awake: apic_timer callback used by apic_sleep; data is the task to wake. */
+static void apic_awake(unsigned long long exp, unsigned long data)
+{
+	struct task_struct * p = (struct task_struct *) data;
+	wake_up_process(p); // this wakes up the sleeping process
+}
+
+/* apic_sleep:
+ * 1) puts the currently executing process to sleep
+ *    for the specified number of clock cycles, using
+ *    the high resolution apic_timer. Requests shorter
+ *    than 0.5ms busy-wait in udelay() instead.
+ * 2) moves the sleeping process's task structure to
+ *    a wait queue and calls the scheduler (via sleep_on)
+ *    to pick a new process to run.
+ * Returns 1 after a busy-wait, 0 after a timed sleep and -1
+ * when the timer could not be allocated or added.
+ * NOTE(review): the timer is added before sleep_on() runs; confirm
+ * that it cannot expire in between, or the wakeup would be missed.
+ */
+int apic_sleep( struct task_struct *p, unsigned long long cycles)
+{
+	unsigned long eax, edx;
+	unsigned long long now, when;
+	if(cycles < 500*(cpu_khz/1000)){//if less than 0.5ms, then use udelay.
+		udelay((unsigned long)(cycles + cpu_khz/2000)/(cpu_khz/1000));//round up to 1us
+		return 1;
+	}
+
+	rdtsc(eax, edx);
+	now = TO_ULL(edx, eax);
+	when = now + cycles;
+
+	p->apic_timer = kmalloc(sizeof(*p->apic_timer), GFP_KERNEL);
+	if (p->apic_timer == NULL){
+		printk("Couldn't kmalloc the size for apic timer\n");
+		return -1;
+	}
+	init_apic_timer(p->apic_timer);
+	p->apic_timer->expires = when;
+	p->apic_timer->data = (unsigned long) p;
+	p->apic_timer->function = apic_awake;
+
+	if(!add_apic_timer(p->apic_timer)){
+		printk("apic_sleep: error, can't add apic timer for process: %d\n", p->pid);
+		//no timer is pending, so nothing would end the sleep below
+		kfree(p->apic_timer);
+		p->apic_timer = NULL;
+		return -1;
+	}
+
+	sleep_on(&apic_wq_head);
+
+	kfree(p->apic_timer);
+	p->apic_timer = NULL; //do not keep a pointer to freed memory
+	return 0;
+}
+
+/* apic_brake:
+ * this function is called when an apic_timer expires,
+ * i.e. the process uses up the wcet. In this case, a
+ * wrong wcet was given by the user application.
+ * The process is marked APIC_BRAKE, its remaining budget is cleared,
+ * reset_process() is run on it and need_resched is set on current.
+ */
+static void apic_brake(unsigned long long exp, unsigned long data)
+{
+
+	struct task_struct * p = (struct task_struct *) data;
+
+	p->rbed_state = APIC_BRAKE; //overrun resource
+	p->apic_left_tsc = 0;
+	if(p->weight >0){ //for BE: only boost for the first pseudo period(110ms)
+		p->weight = 0 ;
+	}
+	reset_process(p);
+
+	current->need_resched = 1;
+}
+
+/* apic_start:
+ * Start the apic one-shot timer when the process is ready to
+ * execute after creation or selected to be the running process.
+ * The single global apic_timer is set to expire cycles from now
+ * with apic_brake() as its callback for p. Always returns 0.
+ */
+struct apic_timer_list apic_timer;
+int apic_start(struct task_struct *p, long long cycles)
+{
+	unsigned long eax, edx;
+	unsigned long long now, when;
+
+	rdtsc(eax, edx);
+	now = TO_ULL(edx, eax);
+	when = now + cycles;
+	p->apic_start_tsc = now;
+
+	init_apic_timer(&apic_timer);
+	apic_timer.expires = when;
+	apic_timer.data = (unsigned long) p;
+	apic_timer.function = apic_brake;
+
+	if(!add_apic_timer(&apic_timer))
+		printk("apic_start: error, can't add apic timer for process: %d\n", p->pid);
+
+	return 0;
+}
+
+/* Stop the apic one-shot timer started by apic_start(). The timer is
+ * global, so p is not used. Returns the result of del_apic_timer().
+ */
+int apic_over(struct task_struct *p)
+{
+	int rtn=0;
+	rtn = del_apic_timer(&apic_timer);
+	return rtn;
+}
+
+/* sys_test_oneshot: test system call that puts the caller to sleep
+ * for useconds microseconds through apic_sleep().
+ */
+asmlinkage void sys_test_oneshot(unsigned long useconds)
+{
+	//apic_sleep() expects clock cycles; multiply in 64 bits to avoid wrap
+	apic_sleep(current, (unsigned long long) useconds * (cpu_khz/1000));
+}
+